1 #include "oem_event_manager.hpp"
2
3 #include "libcper/Cper.h"
4
5 #include "cper.hpp"
6 #include "requester/handler.hpp"
7 #include "requester/request.hpp"
8
9 #include <config.h>
10 #include <libpldm/pldm.h>
11 #include <systemd/sd-journal.h>
12
13 #include <com/ampere/Event/ReportedSEL/event.hpp>
14 #include <phosphor-logging/commit.hpp>
15 #include <phosphor-logging/lg2.hpp>
16 #include <xyz/openbmc_project/Logging/Entry/server.hpp>
17
18 #include <algorithm>
19 #include <map>
20 #include <set>
21 #include <sstream>
22 #include <string>
23 #include <unordered_map>
24
25 namespace pldm
26 {
27 namespace oem_ampere
28 {
// Shorthand for the standard filesystem namespace.
namespace fs = std::filesystem;
using namespace std::chrono;
// Aliases for the Ampere ReportedSEL error and event types that
// sendJournalRedfish() commits.
namespace ReportedErrorSEL = sdbusplus::error::com::ampere::event::ReportedSEL;
namespace ReportedEventSEL = sdbusplus::event::com::ampere::event::ReportedSEL;

// Short aliases for the OEM enumeration namespaces used by the lookup tables
// and event handlers in this file.
namespace boot_stage = boot::stage;
namespace ddr_status = ddr::status;
namespace dimm_status = dimm::status;
namespace dimm_syndrome = dimm::training_failure::dimm_syndrome;
namespace phy_syndrome = dimm::training_failure::phy_syndrome;
namespace training_failure = dimm::training_failure;

// Redfish message ID attached to the journal entry that sendJournalRedfish()
// writes for log_level::BIOSFWPANIC.
constexpr const char* BIOSFWPanicRegistry =
    "OpenBMC.0.1.BIOSFirmwarePanicReason";
// Number of low-order bits of a DIMM bitmap scanned by dimmIdxsToString().
constexpr auto maxDIMMIdxBitNum = 24;
// Number of DIMM instances; sensorIdToDIMMIdx() also returns this value to
// signal that a sensor ID is not a DIMM status sensor.
constexpr auto maxDIMMInstantNum = 24;

// Event IDs for which handlepldmMessagePollEvent() asserts the RAS UE fault
// LED group.
const std::set<uint16_t> rasUESensorIDs = {CORE_UE, MCU_UE, PCIE_UE, SOC_UE};
47
/*
    Possible statuses of a generic boot stage.
    The array index is byte 0 of the boot code; handleBootOverallEvent()
    reports "unknown status" for any larger value.
*/
std::array<std::string, 3> bootStatMsg = {" booting", " completed", " failed"};

/*
    Possible statuses of the DDR training stage.
    The array index is byte 0 of the boot code. For index 1 (in-progress),
    handleBootOverallEvent() also appends byte 1 as a completion percentage.
*/
std::array<std::string, 3> ddrTrainingMsg = {
    " progress started", " in-progress", " progress completed"};

/*
    PMIC temperature alert strings.
    The array index is byte 0 of a DIMM status reading whose status byte is
    dimm_status::PMIC_TEMP_ALERT (see handleDIMMStatusEvent()).
*/
std::array<std::string, 8> pmicTempAlertMsg = {
    "Below 85°C", "85°C",  "95°C",  "105°C",
    "115°C",      "125°C", "135°C", "Equal or greater than 140°C"};

/*
    In Ampere systems, the BMC only communicates directly with the MCTP/PLDM
    SoC EPs through SMBus and PCIe. When the host boots up, the SMBus
    interface comes up first. On this interface, the BMC is the bus owner.

    mctpd will set the EID 0x14 for S0 and 0x16 for S1 (if available).
    pldmd will always use TID 1 for S0 and TID 2 for S1 (if available).

    The OEM handlers in this file ignore events from any TID that is not a
    key of this map.
*/
EventToMsgMap_t tidToSocketNameMap = {{1, "SOCKET 0"}, {2, "SOCKET 1"}};
77
/*
    A map between sensor IDs and their names as strings.
    Using pldm::oem::sensor_ids

    prefixMsgStrCreation() prints the numeric sensor ID instead for any
    sensor that is not listed here.
*/
EventToMsgMap_t sensorIdToStrMap = {
    {DDR_STATUS, "DDR_STATUS"},
    {PCP_VR_STATE, "PCP_VR_STATE"},
    {SOC_VR_STATE, "SOC_VR_STATE"},
    {DPHY_VR1_STATE, "DPHY_VR1_STATE"},
    {DPHY_VR2_STATE, "DPHY_VR2_STATE"},
    {D2D_VR_STATE, "D2D_VR_STATE"},
    {IOC_VR1_STATE, "IOC_VR1_STATE"},
    {IOC_VR2_STATE, "IOC_VR2_STATE"},
    {PCI_D_VR_STATE, "PCI_D_VR_STATE"},
    {PCI_A_VR_STATE, "PCI_A_VR_STATE"},
    {PCIE_HOT_PLUG, "PCIE_HOT_PLUG"},
    {BOOT_OVERALL, "BOOT_OVERALL"},
    {SOC_HEALTH_AVAILABILITY, "SOC_HEALTH_AVAILABILITY"},
    {WATCH_DOG, "WATCH_DOG"}};
97
/*
    A map between the boot stages and logging strings.
    Using pldm::oem::boot::stage::boot_stage

    The key is byte 3 of the BOOT_OVERALL reading. The
    UEFI_STATUS_CLASS_CODE_MIN entry is also used by handleBootOverallEvent()
    as the message prefix for any unlisted stage value up to
    UEFI_STATUS_CLASS_CODE_MAX.
*/
EventToMsgMap_t bootStageToMsgMap = {
    {boot_stage::SECPRO, "SECpro"},
    {boot_stage::MPRO, "Mpro"},
    {boot_stage::ATF_BL1, "ATF BL1"},
    {boot_stage::ATF_BL2, "ATF BL2"},
    {boot_stage::DDR_INITIALIZATION, "DDR initialization"},
    {boot_stage::DDR_TRAINING, "DDR training"},
    {boot_stage::S0_DDR_TRAINING_FAILURE, "DDR training failure"},
    {boot_stage::ATF_BL31, "ATF BL31"},
    {boot_stage::ATF_BL32, "ATF BL32"},
    {boot_stage::S1_DDR_TRAINING_FAILURE, "DDR training failure"},
    {boot_stage::UEFI_STATUS_CLASS_CODE_MIN,
     "ATF BL33 (UEFI) booting status = "}};
115
/*
    A map between DDR status and logging strings.
    Using pldm::oem::ddr::status::ddr_status

    The key is byte 3 of the DDR_STATUS reading. The two entries ending in
    "at DIMMs:" are followed by the list of DIMM indexes that
    handleDDRStatusEvent() decodes from the lower three bytes.
*/
EventToMsgMap_t ddrStatusToMsgMap = {
    {ddr_status::NO_SYSTEM_LEVEL_ERROR, "has no system level error"},
    {ddr_status::ECC_INITIALIZATION_FAILURE, "has ECC initialization failure"},
    {ddr_status::CONFIGURATION_FAILURE, "has configuration failure at DIMMs:"},
    {ddr_status::TRAINING_FAILURE, "has training failure at DIMMs:"},
    {ddr_status::OTHER_FAILURE, "has other failure"},
    {ddr_status::BOOT_FAILURE_NO_VALID_CONFIG,
     "has boot failure due to no configuration"},
    {ddr_status::FAILSAFE_ACTIVATED_NEXT_BOOT_SUCCESS,
     "failsafe activated but boot success with the next valid configuration"}};
130
/*
    A map between DIMM status and logging strings.
    Using pldm::oem::dimm::status::dimm_status

    The key is byte 3 of a DIMM status reading. handleDIMMStatusEvent()
    appends further detail after the TRAINING_FAILURE and PMIC_TEMP_ALERT
    texts, and handles status values missing from this map separately.
*/
EventToMsgMap_t dimmStatusToMsgMap = {
    {dimm_status::INSTALLED_NO_ERROR, "is installed and no error"},
    {dimm_status::NOT_INSTALLED, "is not installed"},
    {dimm_status::OTHER_FAILURE, "has other failure"},
    {dimm_status::INSTALLED_BUT_DISABLED, "is installed but disabled"},
    {dimm_status::TRAINING_FAILURE, "has training failure; "},
    {dimm_status::PMIC_TEMP_ALERT, "has PMIC temperature alert"}};
142
/*
    A map between PHY training failure syndromes and logging strings.
    Using
    pldm::oem::dimm::training_failure::phy_syndrome::phy_training_failure_syndrome

    Selected through dimmTrainingFailureTypeMap when the failure type is
    PHY_TRAINING_FAILURE_TYPE.
*/
EventToMsgMap_t phyTrainingFailureSyndromeToMsgMap = {
    {phy_syndrome::NA, "(N/A)"},
    {phy_syndrome::PHY_TRAINING_SETUP_FAILURE, "(PHY training setup failure)"},
    {phy_syndrome::CA_LEVELING, "(CA leveling)"},
    {phy_syndrome::PHY_WRITE_LEVEL_FAILURE,
     "(PHY write level failure - see syndrome 1)"},
    {phy_syndrome::PHY_READ_GATE_LEVELING_FAILURE,
     "(PHY read gate leveling failure)"},
    {phy_syndrome::PHY_READ_LEVEL_FAILURE, "(PHY read level failure)"},
    {phy_syndrome::WRITE_DQ_LEVELING, "(Write DQ leveling)"},
    {phy_syndrome::PHY_SW_TRAINING_FAILURE, "(PHY SW training failure)"}};
159
/*
    A map between DIMM training failure syndromes and logging strings.
    Using
    pldm::oem::dimm::training_failure::dimm_syndrome::dimm_training_failure_syndrome

    Selected through dimmTrainingFailureTypeMap when the failure type is
    DIMM_TRAINING_FAILURE_TYPE.
*/
EventToMsgMap_t dimmTrainingFailureSyndromeToMsgMap = {
    {dimm_syndrome::NA, "(N/A)"},
    {dimm_syndrome::DRAM_VREFDQ_TRAINING_FAILURE,
     "(DRAM VREFDQ training failure)"},
    {dimm_syndrome::LRDIMM_DB_TRAINING_FAILURE, "(LRDIMM DB training failure)"},
    {dimm_syndrome::LRDRIMM_DB_SW_TRAINING_FAILURE,
     "(LRDRIMM DB SW training failure)"}};
172
/*
    A map between the DIMM training failure type and a pair of
    <logging string, syndrome map>. Using
    pldm::oem::dimm::training_failure::dimm_training_failure_type

    The syndrome maps are copied into this table when it is initialized, so
    it holds its own copies of the two syndrome tables defined above.
*/
std::unordered_map<uint8_t, std::pair<std::string, EventToMsgMap_t>>
    dimmTrainingFailureTypeMap = {
        {training_failure::PHY_TRAINING_FAILURE_TYPE,
         std::make_pair("PHY training failure",
                        phyTrainingFailureSyndromeToMsgMap)},
        {training_failure::DIMM_TRAINING_FAILURE_TYPE,
         std::make_pair("DIMM training failure",
                        dimmTrainingFailureSyndromeToMsgMap)}};
186
/*
    Decoding table for the state sensors handled by
    processStateSensorEvent():
      - outer key: sensor ID;
      - vector index: the sensorOffset reported in the event, selecting one
        named component of the sensor;
      - inner key: event state value, mapped to the log level and text used
        when that state is reported.
*/
std::unordered_map<
    uint16_t,
    std::vector<std::pair<
        std::string,
        std::unordered_map<uint8_t, std::pair<log_level, std::string>>>>>
    stateSensorToMsgMap = {
        {SOC_HEALTH_AVAILABILITY,
         {{"SoC Health",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::WARNING, "Non-Critical"}},
            {3, {log_level::CRITICAL, "Critical"}},
            {4, {log_level::CRITICAL, "Fatal"}}}},
          {"SoC Availability",
           {{1, {log_level::OK, "Enabled"}},
            {2, {log_level::WARNING, "Disabled"}},
            {3, {log_level::CRITICAL, "Shutdown"}}}}}},
        {WATCH_DOG,
         {{"Global Watch Dog",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::CRITICAL, "Timer Expired"}}}},
          {"Secure Watch Dog",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::CRITICAL, "Timer Expired"}}}},
          {"Non-secure Watch Dog",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::CRITICAL, "Timer Expired"}}}}}}};
213
prefixMsgStrCreation(pldm_tid_t tid,uint16_t sensorId)214 std::string OemEventManager::prefixMsgStrCreation(pldm_tid_t tid,
215 uint16_t sensorId)
216 {
217 std::string description;
218
219 if (!sensorIdToStrMap.contains(sensorId))
220 {
221 description += "Sensor ID " + std::to_string(sensorId) + " of ";
222 }
223 else
224 {
225 description += "Sensor " + sensorIdToStrMap[sensorId] + " of ";
226 }
227
228 if (!tidToSocketNameMap.contains(tid))
229 {
230 description += "TID " + std::to_string(tid);
231 }
232 else
233 {
234 description += tidToSocketNameMap[tid];
235 }
236
237 return description;
238 }
239
sendJournalRedfish(const std::string & source,const std::string & description,log_level & logLevel)240 void OemEventManager::sendJournalRedfish(const std::string& source,
241 const std::string& description,
242 log_level& logLevel)
243 {
244 if (description.empty())
245 {
246 return;
247 }
248
249 switch (logLevel)
250 {
251 case log_level::OK:
252 lg2::commit(ReportedEventSEL::ReportedSELInfo(
253 "SOURCE", source, "MESSAGE", description, "RAW_DATA", ""));
254 break;
255 case log_level::WARNING:
256 lg2::commit(ReportedErrorSEL::ReportedSELWarning(
257 "SOURCE", source, "MESSAGE", description, "RAW_DATA", ""));
258 break;
259 case log_level::CRITICAL:
260 lg2::commit(ReportedErrorSEL::ReportedSELCritical(
261 "SOURCE", source, "MESSAGE", description, "RAW_DATA", ""));
262 break;
263 case log_level::BIOSFWPANIC:
264 lg2::info("MESSAGE={DES}", "DES", description, "REDFISH_MESSAGE_ID",
265 BIOSFWPanicRegistry, "REDFISH_MESSAGE_ARGS", description);
266 break;
267 default:
268 {
269 lg2::error("Invalid {LEVEL} Description {DES}", "LEVEL", logLevel,
270 "DES", description);
271 return;
272 }
273 }
274 }
275
dimmIdxsToString(uint32_t dimmIdxs)276 std::string OemEventManager::dimmIdxsToString(uint32_t dimmIdxs)
277 {
278 std::string description;
279 for (const auto bitIdx : std::views::iota(0, maxDIMMIdxBitNum))
280 {
281 if (dimmIdxs & (static_cast<uint32_t>(1) << bitIdx))
282 {
283 description += " #" + std::to_string(bitIdx);
284 }
285 }
286 return description;
287 }
288
sensorIdToDIMMIdx(const uint16_t & sensorId)289 uint8_t OemEventManager::sensorIdToDIMMIdx(const uint16_t& sensorId)
290 {
291 uint8_t dimmIdx = maxDIMMInstantNum;
292 int sensorId_Off = sensorId - 4;
293 if ((sensorId_Off >= 0) && ((sensorId_Off % 2) == 0) &&
294 ((sensorId_Off / 2) < maxDIMMInstantNum))
295 {
296 dimmIdx = sensorId_Off / 2;
297 }
298 return dimmIdx;
299 }
300
handleBootOverallEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)301 void OemEventManager::handleBootOverallEvent(pldm_tid_t tid, uint16_t sensorId,
302 uint32_t presentReading)
303 {
304 log_level logLevel{log_level::OK};
305 std::string description;
306 std::string source;
307 std::stringstream strStream;
308
309 uint8_t byte0 = (presentReading & 0x000000ff);
310 uint8_t byte1 = (presentReading & 0x0000ff00) >> 8;
311 uint8_t byte2 = (presentReading & 0x00ff0000) >> 16;
312 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
313 /*
314 * Handle SECpro, Mpro, ATF BL1, ATF BL2, ATF BL31,
315 * ATF BL32 and DDR initialization
316 */
317 if (bootStageToMsgMap.contains(byte3))
318 {
319 // Boot stage adding
320 description += bootStageToMsgMap[byte3];
321
322 switch (byte3)
323 {
324 case boot_stage::DDR_TRAINING:
325 if (byte0 >= ddrTrainingMsg.size())
326 {
327 logLevel = log_level::BIOSFWPANIC;
328 description += " unknown status";
329 }
330 else
331 {
332 description += ddrTrainingMsg[byte0];
333 }
334 if (0x01 == byte0)
335 {
336 // Add complete percentage
337 description += " at " + std::to_string(byte1) + "%";
338 }
339 break;
340 case boot_stage::S0_DDR_TRAINING_FAILURE:
341 case boot_stage::S1_DDR_TRAINING_FAILURE:
342 // ddr_training_status_msg()
343 logLevel = log_level::BIOSFWPANIC;
344 description += " at DIMMs:";
345 // dimmIdxs = presentReading & 0x00ffffff;
346 description += dimmIdxsToString(presentReading & 0x00ffffff);
347 description += " of socket ";
348 description +=
349 (boot_stage::S0_DDR_TRAINING_FAILURE == byte3) ? "0" : "1";
350 break;
351 default:
352 if (byte0 >= bootStatMsg.size())
353 {
354 logLevel = log_level::BIOSFWPANIC;
355 description += " unknown status";
356 }
357 else
358 {
359 description += bootStatMsg[byte0];
360 }
361 break;
362 }
363
364 // Sensor report action is fail
365 if (boot::status::BOOT_STATUS_FAILURE == byte2)
366 {
367 logLevel = log_level::BIOSFWPANIC;
368 }
369 }
370 else
371 {
372 if (byte3 <= boot_stage::UEFI_STATUS_CLASS_CODE_MAX)
373 {
374 description +=
375 bootStageToMsgMap[boot_stage::UEFI_STATUS_CLASS_CODE_MIN];
376
377 strStream
378 << "Segment (0x" << std::setfill('0') << std::hex
379 << std::setw(8) << static_cast<uint32_t>(presentReading)
380 << "); Status Class (0x" << std::setw(2)
381 << static_cast<uint32_t>(byte3) << "); Status SubClass (0x"
382 << std::setw(2) << static_cast<uint32_t>(byte2)
383 << "); Operation Code (0x" << std::setw(4)
384 << static_cast<uint32_t>((presentReading & 0xffff0000) >> 16)
385 << ")" << std::dec;
386
387 description += strStream.str();
388 }
389 }
390
391 source = prefixMsgStrCreation(tid, sensorId);
392 // Log to Redfish event
393 sendJournalRedfish(source, description, logLevel);
394 }
395
processNumericSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)396 int OemEventManager::processNumericSensorEvent(
397 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
398 size_t sensorDataLength)
399 {
400 uint8_t eventState = 0;
401 uint8_t previousEventState = 0;
402 uint8_t sensorDataSize = 0;
403 uint32_t presentReading;
404 auto rc = decode_numeric_sensor_data(
405 sensorData, sensorDataLength, &eventState, &previousEventState,
406 &sensorDataSize, &presentReading);
407 if (rc)
408 {
409 lg2::error(
410 "Failed to decode numericSensorState event for terminus ID {TID}, error {RC} ",
411 "TID", tid, "RC", rc);
412 return rc;
413 }
414
415 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
416 if (auto dimmIdx = sensorIdToDIMMIdx(sensorId); dimmIdx < maxDIMMInstantNum)
417 {
418 handleDIMMStatusEvent(tid, sensorId, presentReading);
419 return PLDM_SUCCESS;
420 }
421
422 switch (sensorId)
423 {
424 case BOOT_OVERALL:
425 handleBootOverallEvent(tid, sensorId, presentReading);
426 break;
427 case PCIE_HOT_PLUG:
428 handlePCIeHotPlugEvent(tid, sensorId, presentReading);
429 break;
430 case DDR_STATUS:
431 handleDDRStatusEvent(tid, sensorId, presentReading);
432 break;
433 case PCP_VR_STATE:
434 case SOC_VR_STATE:
435 case DPHY_VR1_STATE:
436 case DPHY_VR2_STATE:
437 case D2D_VR_STATE:
438 case IOC_VR1_STATE:
439 case IOC_VR2_STATE:
440 case PCI_D_VR_STATE:
441 case PCI_A_VR_STATE:
442 handleVRDStatusEvent(tid, sensorId, presentReading);
443 break;
444 case WATCH_DOG:
445 handleNumericWatchdogEvent(tid, sensorId, presentReading);
446 break;
447 default:
448 std::string description;
449 std::stringstream strStream;
450
451 description += "SENSOR_EVENT : NUMERIC_SENSOR_STATE: ";
452 description += prefixMsgStrCreation(tid, sensorId);
453 strStream << std::setfill('0') << std::hex << "eventState 0x"
454 << std::setw(2) << static_cast<uint32_t>(eventState)
455 << " previousEventState 0x" << std::setw(2)
456 << static_cast<uint32_t>(previousEventState)
457 << " sensorDataSize 0x" << std::setw(2)
458 << static_cast<uint32_t>(sensorDataSize)
459 << " presentReading 0x" << std::setw(8)
460 << static_cast<uint32_t>(presentReading) << std::dec;
461 description += strStream.str();
462 std::cout << description << "\n";
463 }
464 return PLDM_SUCCESS;
465 }
466
processStateSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)467 int OemEventManager::processStateSensorEvent(pldm_tid_t tid, uint16_t sensorId,
468 const uint8_t* sensorData,
469 size_t sensorDataLength)
470 {
471 uint8_t sensorOffset = 0;
472 uint8_t eventState = 0;
473 uint8_t previousEventState = 0;
474
475 auto rc =
476 decode_state_sensor_data(sensorData, sensorDataLength, &sensorOffset,
477 &eventState, &previousEventState);
478 if (rc)
479 {
480 lg2::error(
481 "Failed to decode stateSensorState event for terminus ID {TID}, error {RC}",
482 "TID", tid, "RC", rc);
483 return rc;
484 }
485
486 std::string description;
487 std::string source = prefixMsgStrCreation(tid, sensorId);
488
489 if (stateSensorToMsgMap.contains(sensorId))
490 {
491 log_level logLevel = log_level::OK;
492
493 auto componentMap = stateSensorToMsgMap[sensorId];
494 if (sensorOffset < componentMap.size())
495 {
496 description += std::get<0>(componentMap[sensorOffset]);
497 auto stateMap = std::get<1>(componentMap[sensorOffset]);
498 if (stateMap.contains(eventState))
499 {
500 logLevel = std::get<0>(stateMap[eventState]);
501 description += " state : " + std::get<1>(stateMap[eventState]);
502 if (stateMap.contains(previousEventState))
503 {
504 description += "; previous state: " +
505 std::get<1>(stateMap[previousEventState]);
506 }
507 }
508 else
509 {
510 description += " sends unsupported event state: " +
511 std::to_string(eventState);
512 if (stateMap.contains(previousEventState))
513 {
514 description += "; previous state: " +
515 std::get<1>(stateMap[previousEventState]);
516 }
517 }
518 }
519 else
520 {
521 description += "sends unsupported component sensor offset " +
522 std::to_string(sensorOffset);
523 }
524
525 sendJournalRedfish(source, description, logLevel);
526 }
527 else
528 {
529 std::stringstream strStream;
530 description += "SENSOR_EVENT : STATE_SENSOR_STATE: ";
531 description += prefixMsgStrCreation(tid, sensorId);
532 strStream << std::setfill('0') << std::hex << "sensorOffset 0x"
533 << std::setw(2) << static_cast<uint32_t>(sensorOffset)
534 << "eventState 0x" << std::setw(2)
535 << static_cast<uint32_t>(eventState)
536 << " previousEventState 0x" << std::setw(2)
537 << static_cast<uint32_t>(previousEventState) << std::dec;
538 description += strStream.str();
539 std::cout << description << "\n";
540 }
541
542 return PLDM_SUCCESS;
543 }
544
processSensorOpStateEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)545 int OemEventManager::processSensorOpStateEvent(
546 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
547 size_t sensorDataLength)
548 {
549 uint8_t present_op_state = 0;
550 uint8_t previous_op_state = 0;
551
552 auto rc = decode_sensor_op_data(sensorData, sensorDataLength,
553 &present_op_state, &previous_op_state);
554 if (rc)
555 {
556 lg2::error(
557 "Failed to decode sensorOpState event for terminus ID {TID}, error {RC}",
558 "TID", tid, "RC", rc);
559 return rc;
560 }
561
562 std::string description;
563 std::stringstream strStream;
564
565 description += "SENSOR_EVENT : SENSOR_OP_STATE: ";
566 description += prefixMsgStrCreation(tid, sensorId);
567 strStream << std::setfill('0') << std::hex << "present_op_state 0x"
568 << std::setw(2) << static_cast<uint32_t>(present_op_state)
569 << "previous_op_state 0x" << std::setw(2)
570 << static_cast<uint32_t>(previous_op_state) << std::dec;
571 description += strStream.str();
572 std::cout << description << "\n";
573
574 return PLDM_SUCCESS;
575 }
576
handleSensorEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)577 int OemEventManager::handleSensorEvent(
578 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
579 pldm_tid_t tid, size_t eventDataOffset)
580 {
581 /* This OEM event handler is only used for SoC terminus*/
582 if (!tidToSocketNameMap.contains(tid))
583 {
584 return PLDM_SUCCESS;
585 }
586 auto eventData =
587 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
588 auto eventDataSize = payloadLength - eventDataOffset;
589
590 uint16_t sensorId = 0;
591 uint8_t sensorEventClassType = 0;
592 size_t eventClassDataOffset = 0;
593 auto rc =
594 decode_sensor_event_data(eventData, eventDataSize, &sensorId,
595 &sensorEventClassType, &eventClassDataOffset);
596 if (rc)
597 {
598 lg2::error("Failed to decode sensor event data return code {RC}.", "RC",
599 rc);
600 return rc;
601 }
602 const uint8_t* sensorData = eventData + eventClassDataOffset;
603 size_t sensorDataLength = eventDataSize - eventClassDataOffset;
604
605 switch (sensorEventClassType)
606 {
607 case PLDM_NUMERIC_SENSOR_STATE:
608 {
609 return processNumericSensorEvent(tid, sensorId, sensorData,
610 sensorDataLength);
611 }
612 case PLDM_STATE_SENSOR_STATE:
613 {
614 return processStateSensorEvent(tid, sensorId, sensorData,
615 sensorDataLength);
616 }
617 case PLDM_SENSOR_OP_STATE:
618 {
619 return processSensorOpStateEvent(tid, sensorId, sensorData,
620 sensorDataLength);
621 }
622 default:
623 std::string description;
624 std::stringstream strStream;
625
626 description += "SENSOR_EVENT : Unsupported Sensor Class " +
627 std::to_string(sensorEventClassType) + ": ";
628 description += prefixMsgStrCreation(tid, sensorId);
629 strStream << std::setfill('0') << std::hex
630 << std::setw(sizeof(sensorData) * 2) << "Sensor data: ";
631
632 auto dataPtr = sensorData;
633 for ([[maybe_unused]] const auto& i :
634 std::views::iota(0, (int)sensorDataLength))
635 {
636 strStream << "0x" << static_cast<uint32_t>(*dataPtr);
637 dataPtr += sizeof(sensorData);
638 }
639
640 description += strStream.str();
641 std::cout << description << "\n";
642 }
643
644 return PLDM_ERROR;
645 }
646
handlePCIeHotPlugEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)647 void OemEventManager::handlePCIeHotPlugEvent(pldm_tid_t tid, uint16_t sensorId,
648 uint32_t presentReading)
649 {
650 std::string description;
651 std::string source;
652 std::stringstream strStream;
653 PCIeHotPlugEventRecord_t record{presentReading};
654
655 std::string sAction = (!record.bits.action) ? "Insertion" : "Removal";
656 std::string sOpStatus = (!record.bits.opStatus) ? "Successful" : "Failed";
657 log_level logLevel =
658 (!record.bits.opStatus) ? log_level::OK : log_level::WARNING;
659
660 source = prefixMsgStrCreation(tid, sensorId);
661
662 strStream << "Segment (0x" << std::setfill('0') << std::hex << std::setw(2)
663 << static_cast<uint32_t>(record.bits.segment) << "); Bus (0x"
664 << std::setw(2) << static_cast<uint32_t>(record.bits.bus)
665 << "); Device (0x" << std::setw(2)
666 << static_cast<uint32_t>(record.bits.device) << "); Function (0x"
667 << std::setw(2) << static_cast<uint32_t>(record.bits.function)
668 << "); Action (" << sAction << "); Operation status ("
669 << sOpStatus << "); Media slot number (" << std::dec
670 << static_cast<uint32_t>(record.bits.mediaSlot) << ")";
671
672 description += strStream.str();
673
674 // Log to Redfish event
675 sendJournalRedfish(source, description, logLevel);
676 }
677
dimmTrainingFailureToMsg(uint32_t failureInfo)678 std::string OemEventManager::dimmTrainingFailureToMsg(uint32_t failureInfo)
679 {
680 std::string description;
681 DIMMTrainingFailure_t failure{failureInfo};
682
683 if (dimmTrainingFailureTypeMap.contains(failure.bits.type))
684 {
685 auto failureInfoMap = dimmTrainingFailureTypeMap[failure.bits.type];
686
687 description += std::get<0>(failureInfoMap);
688
689 description += "; MCU rank index " +
690 std::to_string(failure.bits.mcuRankIdx);
691
692 description += "; Slice number " +
693 std::to_string(failure.bits.sliceNum);
694
695 description += "; Upper nibble error status: ";
696 description += (!failure.bits.upperNibbStatErr)
697 ? "No error"
698 : "Found no rising edge";
699
700 description += "; Lower nibble error status: ";
701 description += (!failure.bits.lowerNibbStatErr)
702 ? "No error"
703 : "Found no rising edge";
704
705 description += "; Failure syndrome 0: ";
706
707 auto& syndromeMap = std::get<1>(failureInfoMap);
708 if (syndromeMap.contains(failure.bits.syndrome))
709 {
710 description += syndromeMap[failure.bits.syndrome];
711 }
712 else
713 {
714 description += "(Unknown syndrome)";
715 }
716 }
717 else
718 {
719 description += "Unknown training failure type " +
720 std::to_string(failure.bits.type);
721 }
722
723 return description;
724 }
725
handleDIMMStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)726 void OemEventManager::handleDIMMStatusEvent(pldm_tid_t tid, uint16_t sensorId,
727 uint32_t presentReading)
728 {
729 log_level logLevel{log_level::WARNING};
730 std::string description;
731 std::string source;
732 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
733 uint32_t byte012 = presentReading & 0xffffff;
734
735 source = prefixMsgStrCreation(tid, sensorId);
736
737 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
738 auto dimmIdx = sensorIdToDIMMIdx(sensorId);
739 if (dimmIdx >= maxDIMMIdxBitNum)
740 {
741 return;
742 }
743
744 description += "DIMM " + std::to_string(dimmIdx) + " ";
745
746 if (dimmStatusToMsgMap.contains(byte3))
747 {
748 if (byte3 == dimm_status::INSTALLED_NO_ERROR ||
749 byte3 == dimm_status::INSTALLED_BUT_DISABLED)
750 {
751 logLevel = log_level::OK;
752 }
753
754 description += dimmStatusToMsgMap[byte3];
755
756 if (byte3 == dimm_status::TRAINING_FAILURE)
757 {
758 description += "; " + dimmTrainingFailureToMsg(byte012);
759 }
760 else if (byte3 == dimm_status::PMIC_TEMP_ALERT)
761 {
762 uint8_t byte0 = (byte012 & 0xff);
763 if (byte0 < pmicTempAlertMsg.size())
764 {
765 description += ": " + pmicTempAlertMsg[byte0];
766 }
767 }
768 }
769 else
770 {
771 switch (byte3)
772 {
773 case dimm_status::PMIC_HIGH_TEMP:
774 if (byte012 == 0x01)
775 {
776 description += "has PMIC high temp condition";
777 }
778 break;
779 case dimm_status::TSx_HIGH_TEMP:
780 switch (byte012)
781 {
782 case 0x01:
783 description += "has TS0";
784 break;
785 case 0x02:
786 description += "has TS1";
787 break;
788 case 0x03:
789 description += "has TS0 and TS1";
790 break;
791 }
792 description += " exceeding their high temperature threshold";
793 break;
794 case dimm_status::SPD_HUB_HIGH_TEMP:
795 if (byte012 == 0x01)
796 {
797 description += "has SPD/HUB high temp condition";
798 }
799 break;
800 default:
801 description += "has unsupported status " +
802 std::to_string(byte3);
803 break;
804 }
805 }
806
807 // Log to Redfish event
808 sendJournalRedfish(source, description, logLevel);
809 }
810
handleDDRStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)811 void OemEventManager::handleDDRStatusEvent(pldm_tid_t tid, uint16_t sensorId,
812 uint32_t presentReading)
813 {
814 log_level logLevel{log_level::WARNING};
815 std::string description;
816 std::string source;
817 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
818 uint32_t byte012 = presentReading & 0xffffff;
819
820 source = prefixMsgStrCreation(tid, sensorId);
821
822 description += "DDR ";
823 if (ddrStatusToMsgMap.contains(byte3))
824 {
825 if (byte3 == ddr_status::NO_SYSTEM_LEVEL_ERROR)
826 {
827 logLevel = log_level::OK;
828 }
829
830 description += ddrStatusToMsgMap[byte3];
831
832 if (byte3 == ddr_status::CONFIGURATION_FAILURE ||
833 byte3 == ddr_status::TRAINING_FAILURE)
834 {
835 // List out failed DIMMs
836 description += dimmIdxsToString(byte012);
837 }
838 }
839 else
840 {
841 description += "has unsupported status " + std::to_string(byte3);
842 }
843
844 // Log to Redfish event
845 sendJournalRedfish(source, description, logLevel);
846 }
847
handleVRDStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)848 void OemEventManager::handleVRDStatusEvent(pldm_tid_t tid, uint16_t sensorId,
849 uint32_t presentReading)
850 {
851 log_level logLevel{log_level::WARNING};
852 std::string description;
853 std::string source;
854 std::stringstream strStream;
855
856 source = prefixMsgStrCreation(tid, sensorId);
857
858 VRDStatus_t status{presentReading};
859
860 if (status.bits.warning && status.bits.critical)
861 {
862 description += "A VR warning and a VR critical";
863 logLevel = log_level::CRITICAL;
864 }
865 else
866 {
867 if (status.bits.warning)
868 {
869 description += "A VR warning";
870 }
871 else if (status.bits.critical)
872 {
873 description += "A VR critical";
874 logLevel = log_level::CRITICAL;
875 }
876 else
877 {
878 description += "No VR warning or critical";
879 logLevel = log_level::OK;
880 }
881 }
882 description += " condition observed";
883
884 strStream << "; VR status byte high is 0x" << std::setfill('0') << std::hex
885 << std::setw(2)
886 << static_cast<uint32_t>(status.bits.vr_status_byte_high)
887 << "; VR status byte low is 0x" << std::setw(2)
888 << static_cast<uint32_t>(status.bits.vr_status_byte_low)
889 << "; Reading is 0x" << std::setw(2)
890 << static_cast<uint32_t>(presentReading) << ";";
891
892 description += strStream.str();
893
894 // Log to Redfish event
895 sendJournalRedfish(source, description, logLevel);
896 }
897
handleNumericWatchdogEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)898 void OemEventManager::handleNumericWatchdogEvent(
899 pldm_tid_t tid, uint16_t sensorId, uint32_t presentReading)
900 {
901 std::string description;
902 std::string source;
903 log_level logLevel = log_level::CRITICAL;
904
905 source = prefixMsgStrCreation(tid, sensorId);
906
907 if (presentReading & 0x01)
908 {
909 description += "Global watchdog expired;";
910 }
911 if (presentReading & 0x02)
912 {
913 description += "Secure watchdog expired;";
914 }
915 if (presentReading & 0x04)
916 {
917 description += "Non-secure watchdog expired;";
918 }
919
920 // Log to Redfish event
921 sendJournalRedfish(source, description, logLevel);
922 }
923
processOemMsgPollEvent(pldm_tid_t tid,uint16_t eventId,const uint8_t * eventData,size_t eventDataSize)924 int OemEventManager::processOemMsgPollEvent(pldm_tid_t tid, uint16_t eventId,
925 const uint8_t* eventData,
926 size_t eventDataSize)
927 {
928 EFI_AMPERE_ERROR_DATA ampHdr;
929
930 decodeCperRecord(eventData, eventDataSize, &Hdr);
931
932 addCperSELLog(tid, eventId, &Hdr);
933
934 /* isBert at bit 12 of TypeId */
935 if (ampHdr.TypeId & 0x0800)
936 {
937 lg2::info("Ampere SoC BERT is triggered.");
938 std::variant<std::string> value(
939 "com.ampere.CrashCapture.Trigger.TriggerAction.Bert");
940 try
941 {
942 auto& bus = pldm::utils::DBusHandler::getBus();
943 auto method =
944 bus.new_method_call("com.ampere.CrashCapture.Trigger",
945 "/com/ampere/crashcapture/trigger",
946 pldm::utils::dbusProperties, "Set");
947 method.append("com.ampere.CrashCapture.Trigger", "TriggerActions",
948 value);
949 bus.call_noreply(method);
950 }
951 catch (const std::exception& e)
952 {
953 lg2::error("call BERT trigger error - {ERROR}", "ERROR", e);
954 }
955 }
956
957 return PLDM_SUCCESS;
958 }
959
handlepldmMessagePollEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)960 int OemEventManager::handlepldmMessagePollEvent(
961 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
962 pldm_tid_t tid, size_t eventDataOffset)
963 {
964 /* This OEM event handler is only used for SoC terminus*/
965 if (!tidToSocketNameMap.contains(tid))
966 {
967 return PLDM_SUCCESS;
968 }
969
970 auto eventData =
971 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
972 auto eventDataSize = payloadLength - eventDataOffset;
973
974 pldm_message_poll_event poll_event{};
975 auto rc = decode_pldm_message_poll_event_data(eventData, eventDataSize,
976 &poll_event);
977 if (rc)
978 {
979 lg2::error("Failed to decode PldmMessagePollEvent event, error {RC} ",
980 "RC", rc);
981 return rc;
982 }
983
984 auto sensorID = poll_event.event_id;
985 /* The UE errors */
986 if (rasUESensorIDs.contains(sensorID))
987 {
988 pldm::utils::DBusMapping dbusMapping{
989 "/xyz/openbmc_project/led/groups/ras_ue_fault",
990 "xyz.openbmc_project.Led.Group", "Asserted", "bool"};
991 try
992 {
993 pldm::utils::DBusHandler().setDbusProperty(
994 dbusMapping, pldm::utils::PropertyValue{bool(true)});
995 }
996 catch (const std::exception& e)
997 {
998 lg2::error(
999 "Failed to set the RAS UE LED terminus ID {TID} sensor ID {SENSORID} - errors {ERROR}",
1000 "TID", tid, "SENSORID", sensorID, "ERROR", e);
1001 }
1002 }
1003
1004 return PLDM_SUCCESS;
1005 }
1006
oemPollForPlatformEvent(pldm_tid_t tid)1007 exec::task<int> OemEventManager::oemPollForPlatformEvent(pldm_tid_t tid)
1008 {
1009 uint64_t t0 = 0;
1010
1011 /* This OEM event handler is only used for SoC terminus */
1012 if (!tidToSocketNameMap.contains(tid))
1013 {
1014 co_return PLDM_SUCCESS;
1015 }
1016
1017 if (!timeStampMap.contains(tid))
1018 {
1019 sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1020 timeStampMap.emplace(std::make_pair(tid, t0));
1021 }
1022 else
1023 {
1024 sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1025 uint64_t elapsed = t0 - timeStampMap[tid];
1026 if (elapsed >= NORMAL_EVENT_POLLING_TIME)
1027 {
1028 co_await manager->pollForPlatformEvent(tid, 0, 0);
1029 timeStampMap[tid] = t0;
1030 }
1031 }
1032
1033 co_return PLDM_SUCCESS;
1034 }
1035 } // namespace oem_ampere
1036 } // namespace pldm
1037