1 #include "oem_event_manager.hpp"
2
3 #include "libcper/Cper.h"
4
5 #include "cper.hpp"
6 #include "requester/handler.hpp"
7 #include "requester/request.hpp"
8
9 #include <config.h>
10 #include <libpldm/pldm.h>
11 #include <libpldm/utils.h>
12 #include <systemd/sd-journal.h>
13
14 #include <com/ampere/Event/ReportedSEL/event.hpp>
15 #include <phosphor-logging/commit.hpp>
16 #include <phosphor-logging/lg2.hpp>
17 #include <xyz/openbmc_project/Logging/Entry/server.hpp>
18
19 #include <algorithm>
20 #include <map>
21 #include <set>
22 #include <sstream>
23 #include <string>
24 #include <unordered_map>
25
26 namespace pldm
27 {
28 namespace oem_ampere
29 {
30 namespace fs = std::filesystem;
31 using namespace std::chrono;
32 namespace ReportedErrorSEL = sdbusplus::error::com::ampere::event::ReportedSEL;
33 namespace ReportedEventSEL = sdbusplus::event::com::ampere::event::ReportedSEL;
34
35 namespace boot_stage = boot::stage;
36 namespace ddr_status = ddr::status;
37 namespace dimm_status = dimm::status;
38 namespace dimm_syndrome = dimm::training_failure::dimm_syndrome;
39 namespace phy_syndrome = dimm::training_failure::phy_syndrome;
40 namespace training_failure = dimm::training_failure;
41
/* Redfish message ID attached to journal entries that are logged with the
 * BIOSFWPANIC level (see sendJournalRedfish). */
constexpr const char* BIOSFWPanicRegistry =
    "OpenBMC.0.1.BIOSFirmwarePanicReason";
/* Number of low-order bits examined when a DIMM index bitmask is decoded
 * (see dimmIdxsToString). */
constexpr auto maxDIMMIdxBitNum = 24;
/* Exclusive upper bound for a DIMM index derived from a sensor ID. The same
 * value is returned by sensorIdToDIMMIdx to signal "not a DIMM sensor". */
constexpr auto maxDIMMInstantNum = 24;

/* IDs that are matched against the event ID of a PldmMessagePollEvent; a
 * match asserts the RAS UE fault LED group. */
const std::set<uint16_t> rasUESensorIDs = {CORE_UE, MCU_UE, PCIE_UE, SOC_UE};
48
/*
    Status suffixes appended to the name of a boot stage.
    The array index is byte 0 of the boot code; the caller reports an index
    outside the array as " unknown status".
*/
std::array<std::string, 3> bootStatMsg = {" booting", " completed", " failed"};

/*
    Status suffixes appended to the DDR training stage name.
    The array index is byte 0 of the boot code; the caller reports an index
    outside the array as " unknown status".
*/
std::array<std::string, 3> ddrTrainingMsg = {
    " progress started", " in-progress", " progress completed"};

/*
    PMIC temperature alert texts.
    The array index is byte 0 of the DIMM status reading; an index outside the
    array adds no temperature text to the message.
*/
std::array<std::string, 8> pmicTempAlertMsg = {
    "Below 85°C", "85°C", "95°C", "105°C",
    "115°C", "125°C", "135°C", "Equal or greater than 140°C"};
68
/*
    In Ampere systems, the BMC only communicates directly with the MCTP/PLDM
    SoC endpoints through SMBus and PCIe. When the host boots up, the SMBus
    interface comes up first. On this interface, the BMC is the bus owner.

    mctpd will set the EID 0x14 for S0 and 0x16 for S1 (if available).
    pldmd will always use TID 1 for S0 and TID 2 for S1 (if available).

    The event handlers in this file also use this map as a filter: events
    from a TID that is not listed here are ignored.
*/
EventToMsgMap_t tidToSocketNameMap = {{1, "SOCKET 0"}, {2, "SOCKET 1"}};

/*
    A map between sensor IDs and their names as strings.
    Using pldm::oem::sensor_ids
    A sensor ID that is missing from this map is printed as a number by
    prefixMsgStrCreation.
*/
EventToMsgMap_t sensorIdToStrMap = {
    {DDR_STATUS, "DDR_STATUS"},
    {PCP_VR_STATE, "PCP_VR_STATE"},
    {SOC_VR_STATE, "SOC_VR_STATE"},
    {DPHY_VR1_STATE, "DPHY_VR1_STATE"},
    {DPHY_VR2_STATE, "DPHY_VR2_STATE"},
    {D2D_VR_STATE, "D2D_VR_STATE"},
    {IOC_VR1_STATE, "IOC_VR1_STATE"},
    {IOC_VR2_STATE, "IOC_VR2_STATE"},
    {PCI_D_VR_STATE, "PCI_D_VR_STATE"},
    {PCI_A_VR_STATE, "PCI_A_VR_STATE"},
    {PCIE_HOT_PLUG, "PCIE_HOT_PLUG"},
    {BOOT_OVERALL, "BOOT_OVERALL"},
    {SOC_HEALTH_AVAILABILITY, "SOC_HEALTH_AVAILABILITY"},
    {WATCH_DOG, "WATCH_DOG"}};
98
/*
    A map between the boot stages and logging strings.
    Using pldm::oem::boot::stage::boot_stage
    The key is byte 3 of the BOOT_OVERALL reading. The
    UEFI_STATUS_CLASS_CODE_MIN entry holds the prefix that
    handleBootOverallEvent puts in front of UEFI status code messages.
*/
EventToMsgMap_t bootStageToMsgMap = {
    {boot_stage::SECPRO, "SECpro"},
    {boot_stage::MPRO, "Mpro"},
    {boot_stage::ATF_BL1, "ATF BL1"},
    {boot_stage::ATF_BL2, "ATF BL2"},
    {boot_stage::DDR_INITIALIZATION, "DDR initialization"},
    {boot_stage::DDR_TRAINING, "DDR training"},
    {boot_stage::S0_DDR_TRAINING_FAILURE, "DDR training failure"},
    {boot_stage::ATF_BL31, "ATF BL31"},
    {boot_stage::ATF_BL32, "ATF BL32"},
    {boot_stage::S1_DDR_TRAINING_FAILURE, "DDR training failure"},
    {boot_stage::UEFI_STATUS_CLASS_CODE_MIN,
     "ATF BL33 (UEFI) booting status = "}};

/*
    A map between DDR status and logging strings.
    Using pldm::oem::ddr::status::ddr_status
    The key is byte 3 of the DDR_STATUS reading. The two texts ending in
    "at DIMMs:" are followed by the list of affected DIMM indexes.
*/
EventToMsgMap_t ddrStatusToMsgMap = {
    {ddr_status::NO_SYSTEM_LEVEL_ERROR, "has no system level error"},
    {ddr_status::ECC_INITIALIZATION_FAILURE, "has ECC initialization failure"},
    {ddr_status::CONFIGURATION_FAILURE, "has configuration failure at DIMMs:"},
    {ddr_status::TRAINING_FAILURE, "has training failure at DIMMs:"},
    {ddr_status::OTHER_FAILURE, "has other failure"},
    {ddr_status::BOOT_FAILURE_NO_VALID_CONFIG,
     "has boot failure due to no configuration"},
    {ddr_status::FAILSAFE_ACTIVATED_NEXT_BOOT_SUCCESS,
     "failsafe activated but boot success with the next valid configuration"}};

/*
    A map between DIMM status and logging strings.
    Using pldm::oem::dimm::status::dimm_status
    The key is byte 3 of a DIMM status reading. The TRAINING_FAILURE text
    ends with "; " because the decoded failure details are appended after it.
*/
EventToMsgMap_t dimmStatusToMsgMap = {
    {dimm_status::INSTALLED_NO_ERROR, "is installed and no error"},
    {dimm_status::NOT_INSTALLED, "is not installed"},
    {dimm_status::OTHER_FAILURE, "has other failure"},
    {dimm_status::INSTALLED_BUT_DISABLED, "is installed but disabled"},
    {dimm_status::TRAINING_FAILURE, "has training failure; "},
    {dimm_status::PMIC_TEMP_ALERT, "has PMIC temperature alert"}};
143
/*
    A map between PHY training failure syndromes and logging strings.
    Using
    pldm::oem::dimm::training_failure::phy_syndrome
*/
EventToMsgMap_t phyTrainingFailureSyndromeToMsgMap = {
    {phy_syndrome::NA, "(N/A)"},
    {phy_syndrome::PHY_TRAINING_SETUP_FAILURE, "(PHY training setup failure)"},
    {phy_syndrome::CA_LEVELING, "(CA leveling)"},
    {phy_syndrome::PHY_WRITE_LEVEL_FAILURE,
     "(PHY write level failure - see syndrome 1)"},
    {phy_syndrome::PHY_READ_GATE_LEVELING_FAILURE,
     "(PHY read gate leveling failure)"},
    {phy_syndrome::PHY_READ_LEVEL_FAILURE, "(PHY read level failure)"},
    {phy_syndrome::WRITE_DQ_LEVELING, "(Write DQ leveling)"},
    {phy_syndrome::PHY_SW_TRAINING_FAILURE, "(PHY SW training failure)"}};

/*
    A map between DIMM training failure syndromes and logging strings.
    Using
    pldm::oem::dimm::training_failure::dimm_syndrome
    NOTE(review): "LRDRIMM" in the last text looks like a typo for "LRDIMM";
    confirm that no log consumer matches on the string before changing it.
*/
EventToMsgMap_t dimmTrainingFailureSyndromeToMsgMap = {
    {dimm_syndrome::NA, "(N/A)"},
    {dimm_syndrome::DRAM_VREFDQ_TRAINING_FAILURE,
     "(DRAM VREFDQ training failure)"},
    {dimm_syndrome::LRDIMM_DB_TRAINING_FAILURE, "(LRDIMM DB training failure)"},
    {dimm_syndrome::LRDRIMM_DB_SW_TRAINING_FAILURE,
     "(LRDRIMM DB SW training failure)"}};

/*
    A map between the DIMM training failure type and a pair of
    <logging string, syndrome map>. Using the failure type constants of
    pldm::oem::dimm::training_failure
    Each entry stores its own copy of the syndrome map defined above.
*/
std::unordered_map<uint8_t, std::pair<std::string, EventToMsgMap_t>>
    dimmTrainingFailureTypeMap = {
        {training_failure::PHY_TRAINING_FAILURE_TYPE,
         std::make_pair("PHY training failure",
                        phyTrainingFailureSyndromeToMsgMap)},
        {training_failure::DIMM_TRAINING_FAILURE_TYPE,
         std::make_pair("DIMM training failure",
                        dimmTrainingFailureSyndromeToMsgMap)}};
187
/*
    Message table for state sensor events, used by processStateSensorEvent.

    Outer key   : sensor ID.
    Vector index: sensor offset reported in the event, selecting one
                  component of the sensor.
    Pair        : <component name, map from event state to
                  <log level, state text>>.
*/
std::unordered_map<
    uint16_t,
    std::vector<std::pair<
        std::string,
        std::unordered_map<uint8_t, std::pair<log_level, std::string>>>>>
    stateSensorToMsgMap = {
        {SOC_HEALTH_AVAILABILITY,
         {{"SoC Health",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::WARNING, "Non-Critical"}},
            {3, {log_level::CRITICAL, "Critical"}},
            {4, {log_level::CRITICAL, "Fatal"}}}},
          {"SoC Availability",
           {{1, {log_level::OK, "Enabled"}},
            {2, {log_level::WARNING, "Disabled"}},
            {3, {log_level::CRITICAL, "Shutdown"}}}}}},
        {WATCH_DOG,
         {{"Global Watch Dog",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::CRITICAL, "Timer Expired"}}}},
          {"Secure Watch Dog",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::CRITICAL, "Timer Expired"}}}},
          {"Non-secure Watch Dog",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::CRITICAL, "Timer Expired"}}}}}}};
214
prefixMsgStrCreation(pldm_tid_t tid,uint16_t sensorId)215 std::string OemEventManager::prefixMsgStrCreation(pldm_tid_t tid,
216 uint16_t sensorId)
217 {
218 std::string description;
219
220 if (!sensorIdToStrMap.contains(sensorId))
221 {
222 description += "Sensor ID " + std::to_string(sensorId) + " of ";
223 }
224 else
225 {
226 description += "Sensor " + sensorIdToStrMap[sensorId] + " of ";
227 }
228
229 if (!tidToSocketNameMap.contains(tid))
230 {
231 description += "TID " + std::to_string(tid);
232 }
233 else
234 {
235 description += tidToSocketNameMap[tid];
236 }
237
238 return description;
239 }
240
sendJournalRedfish(const std::string & source,const std::string & description,log_level & logLevel)241 void OemEventManager::sendJournalRedfish(const std::string& source,
242 const std::string& description,
243 log_level& logLevel)
244 {
245 if (description.empty())
246 {
247 return;
248 }
249
250 switch (logLevel)
251 {
252 case log_level::OK:
253 lg2::commit(ReportedEventSEL::ReportedSELInfo(
254 "SOURCE", source, "MESSAGE", description, "RAW_DATA", ""));
255 break;
256 case log_level::WARNING:
257 lg2::commit(ReportedErrorSEL::ReportedSELWarning(
258 "SOURCE", source, "MESSAGE", description, "RAW_DATA", ""));
259 break;
260 case log_level::CRITICAL:
261 lg2::commit(ReportedErrorSEL::ReportedSELCritical(
262 "SOURCE", source, "MESSAGE", description, "RAW_DATA", ""));
263 break;
264 case log_level::BIOSFWPANIC:
265 lg2::info("MESSAGE={DES}", "DES", description, "REDFISH_MESSAGE_ID",
266 BIOSFWPanicRegistry, "REDFISH_MESSAGE_ARGS", description);
267 break;
268 default:
269 {
270 lg2::error("Invalid {LEVEL} Description {DES}", "LEVEL", logLevel,
271 "DES", description);
272 return;
273 }
274 }
275 }
276
dimmIdxsToString(uint32_t dimmIdxs)277 std::string OemEventManager::dimmIdxsToString(uint32_t dimmIdxs)
278 {
279 std::string description;
280 for (const auto bitIdx : std::views::iota(0, maxDIMMIdxBitNum))
281 {
282 if (dimmIdxs & (static_cast<uint32_t>(1) << bitIdx))
283 {
284 description += " #" + std::to_string(bitIdx);
285 }
286 }
287 return description;
288 }
289
sensorIdToDIMMIdx(const uint16_t & sensorId)290 uint8_t OemEventManager::sensorIdToDIMMIdx(const uint16_t& sensorId)
291 {
292 uint8_t dimmIdx = maxDIMMInstantNum;
293 int sensorId_Off = sensorId - 4;
294 if ((sensorId_Off >= 0) && ((sensorId_Off % 2) == 0) &&
295 ((sensorId_Off / 2) < maxDIMMInstantNum))
296 {
297 dimmIdx = sensorId_Off / 2;
298 }
299 return dimmIdx;
300 }
301
handleBootOverallEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)302 void OemEventManager::handleBootOverallEvent(pldm_tid_t tid, uint16_t sensorId,
303 uint32_t presentReading)
304 {
305 log_level logLevel{log_level::OK};
306 std::string description;
307 std::string source;
308 std::stringstream strStream;
309
310 uint8_t byte0 = (presentReading & 0x000000ff);
311 uint8_t byte1 = (presentReading & 0x0000ff00) >> 8;
312 uint8_t byte2 = (presentReading & 0x00ff0000) >> 16;
313 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
314 /*
315 * Handle SECpro, Mpro, ATF BL1, ATF BL2, ATF BL31,
316 * ATF BL32 and DDR initialization
317 */
318 if (bootStageToMsgMap.contains(byte3))
319 {
320 // Boot stage adding
321 description += bootStageToMsgMap[byte3];
322
323 switch (byte3)
324 {
325 case boot_stage::DDR_TRAINING:
326 if (byte0 >= ddrTrainingMsg.size())
327 {
328 logLevel = log_level::BIOSFWPANIC;
329 description += " unknown status";
330 }
331 else
332 {
333 description += ddrTrainingMsg[byte0];
334 }
335 if (0x01 == byte0)
336 {
337 // Add complete percentage
338 description += " at " + std::to_string(byte1) + "%";
339 }
340 break;
341 case boot_stage::S0_DDR_TRAINING_FAILURE:
342 case boot_stage::S1_DDR_TRAINING_FAILURE:
343 // ddr_training_status_msg()
344 logLevel = log_level::BIOSFWPANIC;
345 description += " at DIMMs:";
346 // dimmIdxs = presentReading & 0x00ffffff;
347 description += dimmIdxsToString(presentReading & 0x00ffffff);
348 description += " of socket ";
349 description +=
350 (boot_stage::S0_DDR_TRAINING_FAILURE == byte3) ? "0" : "1";
351 break;
352 default:
353 if (byte0 >= bootStatMsg.size())
354 {
355 logLevel = log_level::BIOSFWPANIC;
356 description += " unknown status";
357 }
358 else
359 {
360 description += bootStatMsg[byte0];
361 }
362 break;
363 }
364
365 // Sensor report action is fail
366 if (boot::status::BOOT_STATUS_FAILURE == byte2)
367 {
368 logLevel = log_level::BIOSFWPANIC;
369 }
370 }
371 else
372 {
373 if (byte3 <= boot_stage::UEFI_STATUS_CLASS_CODE_MAX)
374 {
375 description +=
376 bootStageToMsgMap[boot_stage::UEFI_STATUS_CLASS_CODE_MIN];
377
378 strStream
379 << "Segment (0x" << std::setfill('0') << std::hex
380 << std::setw(8) << static_cast<uint32_t>(presentReading)
381 << "); Status Class (0x" << std::setw(2)
382 << static_cast<uint32_t>(byte3) << "); Status SubClass (0x"
383 << std::setw(2) << static_cast<uint32_t>(byte2)
384 << "); Operation Code (0x" << std::setw(4)
385 << static_cast<uint32_t>((presentReading & 0xffff0000) >> 16)
386 << ")" << std::dec;
387
388 description += strStream.str();
389 }
390 }
391
392 source = prefixMsgStrCreation(tid, sensorId);
393 // Log to Redfish event
394 sendJournalRedfish(source, description, logLevel);
395 }
396
processNumericSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)397 int OemEventManager::processNumericSensorEvent(
398 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
399 size_t sensorDataLength)
400 {
401 uint8_t eventState = 0;
402 uint8_t previousEventState = 0;
403 uint8_t sensorDataSize = 0;
404 uint32_t presentReading;
405 auto rc = decode_numeric_sensor_data(
406 sensorData, sensorDataLength, &eventState, &previousEventState,
407 &sensorDataSize, &presentReading);
408 if (rc)
409 {
410 lg2::error(
411 "Failed to decode numericSensorState event for terminus ID {TID}, error {RC} ",
412 "TID", tid, "RC", rc);
413 return rc;
414 }
415
416 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
417 if (auto dimmIdx = sensorIdToDIMMIdx(sensorId); dimmIdx < maxDIMMInstantNum)
418 {
419 handleDIMMStatusEvent(tid, sensorId, presentReading);
420 return PLDM_SUCCESS;
421 }
422
423 switch (sensorId)
424 {
425 case BOOT_OVERALL:
426 handleBootOverallEvent(tid, sensorId, presentReading);
427 break;
428 case PCIE_HOT_PLUG:
429 handlePCIeHotPlugEvent(tid, sensorId, presentReading);
430 break;
431 case DDR_STATUS:
432 handleDDRStatusEvent(tid, sensorId, presentReading);
433 break;
434 case PCP_VR_STATE:
435 case SOC_VR_STATE:
436 case DPHY_VR1_STATE:
437 case DPHY_VR2_STATE:
438 case D2D_VR_STATE:
439 case IOC_VR1_STATE:
440 case IOC_VR2_STATE:
441 case PCI_D_VR_STATE:
442 case PCI_A_VR_STATE:
443 handleVRDStatusEvent(tid, sensorId, presentReading);
444 break;
445 case WATCH_DOG:
446 handleNumericWatchdogEvent(tid, sensorId, presentReading);
447 break;
448 default:
449 std::string description;
450 std::stringstream strStream;
451
452 description += "SENSOR_EVENT : NUMERIC_SENSOR_STATE: ";
453 description += prefixMsgStrCreation(tid, sensorId);
454 strStream << std::setfill('0') << std::hex << "eventState 0x"
455 << std::setw(2) << static_cast<uint32_t>(eventState)
456 << " previousEventState 0x" << std::setw(2)
457 << static_cast<uint32_t>(previousEventState)
458 << " sensorDataSize 0x" << std::setw(2)
459 << static_cast<uint32_t>(sensorDataSize)
460 << " presentReading 0x" << std::setw(8)
461 << static_cast<uint32_t>(presentReading) << std::dec;
462 description += strStream.str();
463 std::cout << description << "\n";
464 }
465 return PLDM_SUCCESS;
466 }
467
processStateSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)468 int OemEventManager::processStateSensorEvent(pldm_tid_t tid, uint16_t sensorId,
469 const uint8_t* sensorData,
470 size_t sensorDataLength)
471 {
472 uint8_t sensorOffset = 0;
473 uint8_t eventState = 0;
474 uint8_t previousEventState = 0;
475
476 auto rc =
477 decode_state_sensor_data(sensorData, sensorDataLength, &sensorOffset,
478 &eventState, &previousEventState);
479 if (rc)
480 {
481 lg2::error(
482 "Failed to decode stateSensorState event for terminus ID {TID}, error {RC}",
483 "TID", tid, "RC", rc);
484 return rc;
485 }
486
487 std::string description;
488 std::string source = prefixMsgStrCreation(tid, sensorId);
489
490 if (stateSensorToMsgMap.contains(sensorId))
491 {
492 log_level logLevel = log_level::OK;
493
494 auto componentMap = stateSensorToMsgMap[sensorId];
495 if (sensorOffset < componentMap.size())
496 {
497 description += std::get<0>(componentMap[sensorOffset]);
498 auto stateMap = std::get<1>(componentMap[sensorOffset]);
499 if (stateMap.contains(eventState))
500 {
501 logLevel = std::get<0>(stateMap[eventState]);
502 description += " state : " + std::get<1>(stateMap[eventState]);
503 if (stateMap.contains(previousEventState))
504 {
505 description += "; previous state: " +
506 std::get<1>(stateMap[previousEventState]);
507 }
508 }
509 else
510 {
511 description += " sends unsupported event state: " +
512 std::to_string(eventState);
513 if (stateMap.contains(previousEventState))
514 {
515 description += "; previous state: " +
516 std::get<1>(stateMap[previousEventState]);
517 }
518 }
519 }
520 else
521 {
522 description += "sends unsupported component sensor offset " +
523 std::to_string(sensorOffset);
524 }
525
526 sendJournalRedfish(source, description, logLevel);
527 }
528 else
529 {
530 std::stringstream strStream;
531 description += "SENSOR_EVENT : STATE_SENSOR_STATE: ";
532 description += prefixMsgStrCreation(tid, sensorId);
533 strStream << std::setfill('0') << std::hex << "sensorOffset 0x"
534 << std::setw(2) << static_cast<uint32_t>(sensorOffset)
535 << "eventState 0x" << std::setw(2)
536 << static_cast<uint32_t>(eventState)
537 << " previousEventState 0x" << std::setw(2)
538 << static_cast<uint32_t>(previousEventState) << std::dec;
539 description += strStream.str();
540 std::cout << description << "\n";
541 }
542
543 return PLDM_SUCCESS;
544 }
545
processSensorOpStateEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)546 int OemEventManager::processSensorOpStateEvent(
547 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
548 size_t sensorDataLength)
549 {
550 uint8_t present_op_state = 0;
551 uint8_t previous_op_state = 0;
552
553 auto rc = decode_sensor_op_data(sensorData, sensorDataLength,
554 &present_op_state, &previous_op_state);
555 if (rc)
556 {
557 lg2::error(
558 "Failed to decode sensorOpState event for terminus ID {TID}, error {RC}",
559 "TID", tid, "RC", rc);
560 return rc;
561 }
562
563 std::string description;
564 std::stringstream strStream;
565
566 description += "SENSOR_EVENT : SENSOR_OP_STATE: ";
567 description += prefixMsgStrCreation(tid, sensorId);
568 strStream << std::setfill('0') << std::hex << "present_op_state 0x"
569 << std::setw(2) << static_cast<uint32_t>(present_op_state)
570 << "previous_op_state 0x" << std::setw(2)
571 << static_cast<uint32_t>(previous_op_state) << std::dec;
572 description += strStream.str();
573 std::cout << description << "\n";
574
575 return PLDM_SUCCESS;
576 }
577
handleSensorEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)578 int OemEventManager::handleSensorEvent(
579 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
580 pldm_tid_t tid, size_t eventDataOffset)
581 {
582 /* This OEM event handler is only used for SoC terminus*/
583 if (!tidToSocketNameMap.contains(tid))
584 {
585 return PLDM_SUCCESS;
586 }
587 auto eventData =
588 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
589 auto eventDataSize = payloadLength - eventDataOffset;
590
591 uint16_t sensorId = 0;
592 uint8_t sensorEventClassType = 0;
593 size_t eventClassDataOffset = 0;
594 auto rc =
595 decode_sensor_event_data(eventData, eventDataSize, &sensorId,
596 &sensorEventClassType, &eventClassDataOffset);
597 if (rc)
598 {
599 lg2::error("Failed to decode sensor event data return code {RC}.", "RC",
600 rc);
601 return rc;
602 }
603 const uint8_t* sensorData = eventData + eventClassDataOffset;
604 size_t sensorDataLength = eventDataSize - eventClassDataOffset;
605
606 switch (sensorEventClassType)
607 {
608 case PLDM_NUMERIC_SENSOR_STATE:
609 {
610 return processNumericSensorEvent(tid, sensorId, sensorData,
611 sensorDataLength);
612 }
613 case PLDM_STATE_SENSOR_STATE:
614 {
615 return processStateSensorEvent(tid, sensorId, sensorData,
616 sensorDataLength);
617 }
618 case PLDM_SENSOR_OP_STATE:
619 {
620 return processSensorOpStateEvent(tid, sensorId, sensorData,
621 sensorDataLength);
622 }
623 default:
624 std::string description;
625 std::stringstream strStream;
626
627 description += "SENSOR_EVENT : Unsupported Sensor Class " +
628 std::to_string(sensorEventClassType) + ": ";
629 description += prefixMsgStrCreation(tid, sensorId);
630 strStream << std::setfill('0') << std::hex
631 << std::setw(sizeof(sensorData) * 2) << "Sensor data: ";
632
633 auto dataPtr = sensorData;
634 for ([[maybe_unused]] const auto& i :
635 std::views::iota(0, (int)sensorDataLength))
636 {
637 strStream << "0x" << static_cast<uint32_t>(*dataPtr);
638 dataPtr += sizeof(sensorData);
639 }
640
641 description += strStream.str();
642 std::cout << description << "\n";
643 }
644
645 return PLDM_ERROR;
646 }
647
handlePCIeHotPlugEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)648 void OemEventManager::handlePCIeHotPlugEvent(pldm_tid_t tid, uint16_t sensorId,
649 uint32_t presentReading)
650 {
651 std::string description;
652 std::string source;
653 std::stringstream strStream;
654 PCIeHotPlugEventRecord_t record{presentReading};
655
656 std::string sAction = (!record.bits.action) ? "Insertion" : "Removal";
657 std::string sOpStatus = (!record.bits.opStatus) ? "Successful" : "Failed";
658 log_level logLevel =
659 (!record.bits.opStatus) ? log_level::OK : log_level::WARNING;
660
661 source = prefixMsgStrCreation(tid, sensorId);
662
663 strStream << "Segment (0x" << std::setfill('0') << std::hex << std::setw(2)
664 << static_cast<uint32_t>(record.bits.segment) << "); Bus (0x"
665 << std::setw(2) << static_cast<uint32_t>(record.bits.bus)
666 << "); Device (0x" << std::setw(2)
667 << static_cast<uint32_t>(record.bits.device) << "); Function (0x"
668 << std::setw(2) << static_cast<uint32_t>(record.bits.function)
669 << "); Action (" << sAction << "); Operation status ("
670 << sOpStatus << "); Media slot number (" << std::dec
671 << static_cast<uint32_t>(record.bits.mediaSlot) << ")";
672
673 description += strStream.str();
674
675 // Log to Redfish event
676 sendJournalRedfish(source, description, logLevel);
677 }
678
dimmTrainingFailureToMsg(uint32_t failureInfo)679 std::string OemEventManager::dimmTrainingFailureToMsg(uint32_t failureInfo)
680 {
681 std::string description;
682 DIMMTrainingFailure_t failure{failureInfo};
683
684 if (dimmTrainingFailureTypeMap.contains(failure.bits.type))
685 {
686 auto failureInfoMap = dimmTrainingFailureTypeMap[failure.bits.type];
687
688 description += std::get<0>(failureInfoMap);
689
690 description += "; MCU rank index " +
691 std::to_string(failure.bits.mcuRankIdx);
692
693 description += "; Slice number " +
694 std::to_string(failure.bits.sliceNum);
695
696 description += "; Upper nibble error status: ";
697 description += (!failure.bits.upperNibbStatErr)
698 ? "No error"
699 : "Found no rising edge";
700
701 description += "; Lower nibble error status: ";
702 description += (!failure.bits.lowerNibbStatErr)
703 ? "No error"
704 : "Found no rising edge";
705
706 description += "; Failure syndrome 0: ";
707
708 auto& syndromeMap = std::get<1>(failureInfoMap);
709 if (syndromeMap.contains(failure.bits.syndrome))
710 {
711 description += syndromeMap[failure.bits.syndrome];
712 }
713 else
714 {
715 description += "(Unknown syndrome)";
716 }
717 }
718 else
719 {
720 description += "Unknown training failure type " +
721 std::to_string(failure.bits.type);
722 }
723
724 return description;
725 }
726
handleDIMMStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)727 void OemEventManager::handleDIMMStatusEvent(pldm_tid_t tid, uint16_t sensorId,
728 uint32_t presentReading)
729 {
730 log_level logLevel{log_level::WARNING};
731 std::string description;
732 std::string source;
733 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
734 uint32_t byte012 = presentReading & 0xffffff;
735
736 source = prefixMsgStrCreation(tid, sensorId);
737
738 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
739 auto dimmIdx = sensorIdToDIMMIdx(sensorId);
740 if (dimmIdx >= maxDIMMIdxBitNum)
741 {
742 return;
743 }
744
745 description += "DIMM " + std::to_string(dimmIdx) + " ";
746
747 if (dimmStatusToMsgMap.contains(byte3))
748 {
749 if (byte3 == dimm_status::INSTALLED_NO_ERROR ||
750 byte3 == dimm_status::INSTALLED_BUT_DISABLED)
751 {
752 logLevel = log_level::OK;
753 }
754
755 description += dimmStatusToMsgMap[byte3];
756
757 if (byte3 == dimm_status::TRAINING_FAILURE)
758 {
759 description += "; " + dimmTrainingFailureToMsg(byte012);
760 }
761 else if (byte3 == dimm_status::PMIC_TEMP_ALERT)
762 {
763 uint8_t byte0 = (byte012 & 0xff);
764 if (byte0 < pmicTempAlertMsg.size())
765 {
766 description += ": " + pmicTempAlertMsg[byte0];
767 }
768 }
769 }
770 else
771 {
772 switch (byte3)
773 {
774 case dimm_status::PMIC_HIGH_TEMP:
775 if (byte012 == 0x01)
776 {
777 description += "has PMIC high temp condition";
778 }
779 break;
780 case dimm_status::TSx_HIGH_TEMP:
781 switch (byte012)
782 {
783 case 0x01:
784 description += "has TS0";
785 break;
786 case 0x02:
787 description += "has TS1";
788 break;
789 case 0x03:
790 description += "has TS0 and TS1";
791 break;
792 }
793 description += " exceeding their high temperature threshold";
794 break;
795 case dimm_status::SPD_HUB_HIGH_TEMP:
796 if (byte012 == 0x01)
797 {
798 description += "has SPD/HUB high temp condition";
799 }
800 break;
801 default:
802 description += "has unsupported status " +
803 std::to_string(byte3);
804 break;
805 }
806 }
807
808 // Log to Redfish event
809 sendJournalRedfish(source, description, logLevel);
810 }
811
handleDDRStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)812 void OemEventManager::handleDDRStatusEvent(pldm_tid_t tid, uint16_t sensorId,
813 uint32_t presentReading)
814 {
815 log_level logLevel{log_level::WARNING};
816 std::string description;
817 std::string source;
818 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
819 uint32_t byte012 = presentReading & 0xffffff;
820
821 source = prefixMsgStrCreation(tid, sensorId);
822
823 description += "DDR ";
824 if (ddrStatusToMsgMap.contains(byte3))
825 {
826 if (byte3 == ddr_status::NO_SYSTEM_LEVEL_ERROR)
827 {
828 logLevel = log_level::OK;
829 }
830
831 description += ddrStatusToMsgMap[byte3];
832
833 if (byte3 == ddr_status::CONFIGURATION_FAILURE ||
834 byte3 == ddr_status::TRAINING_FAILURE)
835 {
836 // List out failed DIMMs
837 description += dimmIdxsToString(byte012);
838 }
839 }
840 else
841 {
842 description += "has unsupported status " + std::to_string(byte3);
843 }
844
845 // Log to Redfish event
846 sendJournalRedfish(source, description, logLevel);
847 }
848
handleVRDStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)849 void OemEventManager::handleVRDStatusEvent(pldm_tid_t tid, uint16_t sensorId,
850 uint32_t presentReading)
851 {
852 log_level logLevel{log_level::WARNING};
853 std::string description;
854 std::string source;
855 std::stringstream strStream;
856
857 source = prefixMsgStrCreation(tid, sensorId);
858
859 VRDStatus_t status{presentReading};
860
861 if (status.bits.warning && status.bits.critical)
862 {
863 description += "A VR warning and a VR critical";
864 logLevel = log_level::CRITICAL;
865 }
866 else
867 {
868 if (status.bits.warning)
869 {
870 description += "A VR warning";
871 }
872 else if (status.bits.critical)
873 {
874 description += "A VR critical";
875 logLevel = log_level::CRITICAL;
876 }
877 else
878 {
879 description += "No VR warning or critical";
880 logLevel = log_level::OK;
881 }
882 }
883 description += " condition observed";
884
885 strStream << "; VR status byte high is 0x" << std::setfill('0') << std::hex
886 << std::setw(2)
887 << static_cast<uint32_t>(status.bits.vr_status_byte_high)
888 << "; VR status byte low is 0x" << std::setw(2)
889 << static_cast<uint32_t>(status.bits.vr_status_byte_low)
890 << "; Reading is 0x" << std::setw(2)
891 << static_cast<uint32_t>(presentReading) << ";";
892
893 description += strStream.str();
894
895 // Log to Redfish event
896 sendJournalRedfish(source, description, logLevel);
897 }
898
handleNumericWatchdogEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)899 void OemEventManager::handleNumericWatchdogEvent(
900 pldm_tid_t tid, uint16_t sensorId, uint32_t presentReading)
901 {
902 std::string description;
903 std::string source;
904 log_level logLevel = log_level::CRITICAL;
905
906 source = prefixMsgStrCreation(tid, sensorId);
907
908 if (presentReading & 0x01)
909 {
910 description += "Global watchdog expired;";
911 }
912 if (presentReading & 0x02)
913 {
914 description += "Secure watchdog expired;";
915 }
916 if (presentReading & 0x04)
917 {
918 description += "Non-secure watchdog expired;";
919 }
920
921 // Log to Redfish event
922 sendJournalRedfish(source, description, logLevel);
923 }
924
processOemMsgPollEvent(pldm_tid_t tid,uint16_t eventId,const uint8_t * eventData,size_t eventDataSize)925 int OemEventManager::processOemMsgPollEvent(pldm_tid_t tid, uint16_t eventId,
926 const uint8_t* eventData,
927 size_t eventDataSize)
928 {
929 EFI_AMPERE_ERROR_DATA ampHdr;
930
931 decodeCperRecord(eventData, eventDataSize, &Hdr);
932
933 addCperSELLog(tid, eventId, &Hdr);
934
935 /* isBert at bit 12 of TypeId */
936 if (ampHdr.TypeId & 0x0800)
937 {
938 lg2::info("Ampere SoC BERT is triggered.");
939 std::variant<std::string> value(
940 "com.ampere.CrashCapture.Trigger.TriggerAction.Bert");
941 try
942 {
943 auto& bus = pldm::utils::DBusHandler::getBus();
944 auto method =
945 bus.new_method_call("com.ampere.CrashCapture.Trigger",
946 "/com/ampere/crashcapture/trigger",
947 pldm::utils::dbusProperties, "Set");
948 method.append("com.ampere.CrashCapture.Trigger", "TriggerActions",
949 value);
950 bus.call_noreply(method);
951 }
952 catch (const std::exception& e)
953 {
954 lg2::error("call BERT trigger error - {ERROR}", "ERROR", e);
955 }
956 }
957
958 return PLDM_SUCCESS;
959 }
960
handlepldmMessagePollEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)961 int OemEventManager::handlepldmMessagePollEvent(
962 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
963 pldm_tid_t tid, size_t eventDataOffset)
964 {
965 /* This OEM event handler is only used for SoC terminus*/
966 if (!tidToSocketNameMap.contains(tid))
967 {
968 return PLDM_SUCCESS;
969 }
970
971 auto eventData =
972 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
973 auto eventDataSize = payloadLength - eventDataOffset;
974
975 pldm_message_poll_event poll_event{};
976 auto rc = decode_pldm_message_poll_event_data(eventData, eventDataSize,
977 &poll_event);
978 if (rc)
979 {
980 lg2::error("Failed to decode PldmMessagePollEvent event, error {RC} ",
981 "RC", rc);
982 return rc;
983 }
984
985 auto sensorID = poll_event.event_id;
986 /* The UE errors */
987 if (rasUESensorIDs.contains(sensorID))
988 {
989 pldm::utils::DBusMapping dbusMapping{
990 "/xyz/openbmc_project/led/groups/ras_ue_fault",
991 "xyz.openbmc_project.Led.Group", "Asserted", "bool"};
992 try
993 {
994 pldm::utils::DBusHandler().setDbusProperty(
995 dbusMapping, pldm::utils::PropertyValue{bool(true)});
996 }
997 catch (const std::exception& e)
998 {
999 lg2::error(
1000 "Failed to set the RAS UE LED terminus ID {TID} sensor ID {SENSORID} - errors {ERROR}",
1001 "TID", tid, "SENSORID", sensorID, "ERROR", e);
1002 }
1003 }
1004
1005 return PLDM_SUCCESS;
1006 }
1007
oemPollForPlatformEvent(pldm_tid_t tid)1008 exec::task<int> OemEventManager::oemPollForPlatformEvent(pldm_tid_t tid)
1009 {
1010 uint64_t t0 = 0;
1011
1012 /* This OEM event handler is only used for SoC terminus */
1013 if (!tidToSocketNameMap.contains(tid))
1014 {
1015 co_return PLDM_SUCCESS;
1016 }
1017
1018 if (!timeStampMap.contains(tid))
1019 {
1020 sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1021 timeStampMap.emplace(std::make_pair(tid, t0));
1022 }
1023 else
1024 {
1025 sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1026 uint64_t elapsed = t0 - timeStampMap[tid];
1027 if (elapsed >= NORMAL_EVENT_POLLING_TIME)
1028 {
1029 co_await manager->pollForPlatformEvent(tid, 0, 0);
1030 timeStampMap[tid] = t0;
1031 }
1032 }
1033
1034 co_return PLDM_SUCCESS;
1035 }
1036 } // namespace oem_ampere
1037 } // namespace pldm
1038