1 #include "oem_event_manager.hpp"
2
3 #include "libcper/Cper.h"
4
5 #include "cper.hpp"
6 #include "requester/handler.hpp"
7 #include "requester/request.hpp"
8
9 #include <config.h>
10 #include <libpldm/pldm.h>
11 #include <libpldm/utils.h>
12 #include <systemd/sd-journal.h>
13
14 #include <phosphor-logging/lg2.hpp>
15 #include <xyz/openbmc_project/Logging/Entry/server.hpp>
16
17 #include <algorithm>
18 #include <map>
19 #include <set>
20 #include <sstream>
21 #include <string>
22 #include <unordered_map>
23
24 namespace pldm
25 {
26 namespace oem_ampere
27 {
// Short aliases for the standard-library namespaces.
namespace fs = std::filesystem;
using namespace std::chrono;

// Short aliases for the nested OEM enumeration namespaces referenced by the
// lookup tables and event handlers below.
namespace boot_stage = boot::stage;
namespace ddr_status = ddr::status;
namespace dimm_status = dimm::status;
namespace dimm_syndrome = dimm::training_failure::dimm_syndrome;
namespace phy_syndrome = dimm::training_failure::phy_syndrome;
namespace training_failure = dimm::training_failure;

// Redfish message registry IDs; logLevelToRedfishMsgIdMap selects one of them
// according to the log level of the event being reported.
constexpr const char* ampereEventRegistry = "OpenBMC.0.1.AmpereEvent";
constexpr const char* ampereWarningRegistry = "OpenBMC.0.1.AmpereWarning";
constexpr const char* ampereCriticalRegistry = "OpenBMC.0.1.AmpereCritical";
constexpr const char* BIOSFWPanicRegistry =
    "OpenBMC.0.1.BIOSFirmwarePanicReason";
// Number of bits scanned by dimmIdxsToString() in a DIMM index bitmask.
constexpr auto maxDIMMIdxBitNum = 24;
// Upper bound (exclusive) of the DIMM index returned by sensorIdToDIMMIdx();
// the value itself is returned for "not a DIMM status sensor".
constexpr auto maxDIMMInstantNum = 24;

// Event IDs treated as RAS UE events: a PLDM message-poll event carrying one
// of these IDs asserts the "ras_ue_fault" LED group.
const std::set<uint16_t> rasUESensorIDs = {CORE_UE, MCU_UE, PCIE_UE, SOC_UE};
47
/*
   Status suffixes appended to the name of a boot stage.
   The index is byte 0 of the boot code; handleBootOverallEvent() reports
   an index outside this array as " unknown status".
*/
std::array<std::string, 3> bootStatMsg = {" booting", " completed", " failed"};
53
/*
   Status suffixes appended to the DDR training boot stage.
   The index is byte 0 of the boot code; handleBootOverallEvent() reports
   an index outside this array as " unknown status".
*/
std::array<std::string, 3> ddrTrainingMsg = {
    " progress started", " in-progress", " progress completed"};
60
/*
   PMIC temperature alert levels and their logging strings.
   handleDIMMStatusEvent() indexes this array with byte 0 of the reading
   when the DIMM status is PMIC_TEMP_ALERT; larger values add no text.
*/
std::array<std::string, 8> pmicTempAlertMsg = {
    "Below 85°C", "85°C",  "95°C",  "105°C",
    "115°C",      "125°C", "135°C", "Equal or greater than 140°C"};
67
/*
   In Ampere systems, the BMC communicates directly with the MCTP/PLDM SoC
   endpoints only through SMBus and PCIe. When the host boots up, the SMBus
   interface comes up first. On this interface, the BMC is the bus owner.

   mctpd will set the EID 0x14 for S0 and 0x16 for S1 (if available).
   pldmd will always use TID 1 for S0 and TID 2 for S1 (if available).

   The handlers in this file use this map both to name the socket in log
   messages and to skip events coming from any other terminus.
*/
EventToMsgMap_t tidToSocketNameMap = {{1, "SOCKET 0"}, {2, "SOCKET 1"}};
77
/*
   A map between sensor IDs and their names as strings.
   Using pldm::oem::sensor_ids.
   prefixMsgStrCreation() falls back to the numeric sensor ID for sensors
   that are not listed here.
*/
EventToMsgMap_t sensorIdToStrMap = {
    {DDR_STATUS, "DDR_STATUS"},
    {PCP_VR_STATE, "PCP_VR_STATE"},
    {SOC_VR_STATE, "SOC_VR_STATE"},
    {DPHY_VR1_STATE, "DPHY_VR1_STATE"},
    {DPHY_VR2_STATE, "DPHY_VR2_STATE"},
    {D2D_VR_STATE, "D2D_VR_STATE"},
    {IOC_VR1_STATE, "IOC_VR1_STATE"},
    {IOC_VR2_STATE, "IOC_VR2_STATE"},
    {PCI_D_VR_STATE, "PCI_D_VR_STATE"},
    {PCI_A_VR_STATE, "PCI_A_VR_STATE"},
    {PCIE_HOT_PLUG, "PCIE_HOT_PLUG"},
    {BOOT_OVERALL, "BOOT_OVERALL"},
    {SOC_HEALTH_AVAILABILITY, "SOC_HEALTH_AVAILABILITY"},
    {WATCH_DOG, "WATCH_DOG"}};
97
/*
   A map between the boot stages and logging strings.
   Using pldm::oem::boot::stage::boot_stage.
   The key is byte 3 of the BOOT_OVERALL sensor reading. The
   UEFI_STATUS_CLASS_CODE_MIN entry stores the message prefix that
   handleBootOverallEvent() uses when it reports a UEFI status code.
*/
EventToMsgMap_t bootStageToMsgMap = {
    {boot_stage::SECPRO, "SECpro"},
    {boot_stage::MPRO, "Mpro"},
    {boot_stage::ATF_BL1, "ATF BL1"},
    {boot_stage::ATF_BL2, "ATF BL2"},
    {boot_stage::DDR_INITIALIZATION, "DDR initialization"},
    {boot_stage::DDR_TRAINING, "DDR training"},
    {boot_stage::S0_DDR_TRAINING_FAILURE, "DDR training failure"},
    {boot_stage::ATF_BL31, "ATF BL31"},
    {boot_stage::ATF_BL32, "ATF BL32"},
    {boot_stage::S1_DDR_TRAINING_FAILURE, "DDR training failure"},
    {boot_stage::UEFI_STATUS_CLASS_CODE_MIN,
     "ATF BL33 (UEFI) booting status = "}};
115
/*
   A map between DDR status and logging strings.
   Using pldm::oem::ddr::status::ddr_status.
   The key is byte 3 of the DDR_STATUS sensor reading; the text is appended
   after "DDR " by handleDDRStatusEvent(). For the two entries ending in
   "DIMMs:" the handler appends the list of affected DIMM indexes.
*/
EventToMsgMap_t ddrStatusToMsgMap = {
    {ddr_status::NO_SYSTEM_LEVEL_ERROR, "has no system level error"},
    {ddr_status::ECC_INITIALIZATION_FAILURE, "has ECC initialization failure"},
    {ddr_status::CONFIGURATION_FAILURE, "has configuration failure at DIMMs:"},
    {ddr_status::TRAINING_FAILURE, "has training failure at DIMMs:"},
    {ddr_status::OTHER_FAILURE, "has other failure"},
    {ddr_status::BOOT_FAILURE_NO_VALID_CONFIG,
     "has boot failure due to no configuration"},
    {ddr_status::FAILSAFE_ACTIVATED_NEXT_BOOT_SUCCESS,
     "failsafe activated but boot success with the next valid configuration"}};
130
/*
   A map between DIMM status and logging strings.
   Using pldm::oem::dimm::status::dimm_status.
   The key is byte 3 of a DIMM status sensor reading; the text is appended
   after "DIMM <index> " by handleDIMMStatusEvent(), which then adds further
   details for TRAINING_FAILURE and PMIC_TEMP_ALERT.
*/
EventToMsgMap_t dimmStatusToMsgMap = {
    {dimm_status::INSTALLED_NO_ERROR, "is installed and no error"},
    {dimm_status::NOT_INSTALLED, "is not installed"},
    {dimm_status::OTHER_FAILURE, "has other failure"},
    {dimm_status::INSTALLED_BUT_DISABLED, "is installed but disabled"},
    {dimm_status::TRAINING_FAILURE, "has training failure; "},
    {dimm_status::PMIC_TEMP_ALERT, "has PMIC temperature alert"}};
142
/*
   A map between PHY training failure syndromes and logging strings.
   Using
   pldm::oem::dimm::training_failure::phy_syndrome::phy_training_failure_syndrome.
   Syndromes that are not listed are reported as "(Unknown syndrome)" by
   dimmTrainingFailureToMsg().
*/
EventToMsgMap_t phyTrainingFailureSyndromeToMsgMap = {
    {phy_syndrome::NA, "(N/A)"},
    {phy_syndrome::PHY_TRAINING_SETUP_FAILURE, "(PHY training setup failure)"},
    {phy_syndrome::CA_LEVELING, "(CA leveling)"},
    {phy_syndrome::PHY_WRITE_LEVEL_FAILURE,
     "(PHY write level failure - see syndrome 1)"},
    {phy_syndrome::PHY_READ_GATE_LEVELING_FAILURE,
     "(PHY read gate leveling failure)"},
    {phy_syndrome::PHY_READ_LEVEL_FAILURE, "(PHY read level failure)"},
    {phy_syndrome::WRITE_DQ_LEVELING, "(Write DQ leveling)"},
    {phy_syndrome::PHY_SW_TRAINING_FAILURE, "(PHY SW training failure)"}};
159
/*
   A map between DIMM training failure syndromes and logging strings.
   Using
   pldm::oem::dimm::training_failure::dimm_syndrome::dimm_training_failure_syndrome.
   Syndromes that are not listed are reported as "(Unknown syndrome)" by
   dimmTrainingFailureToMsg().

   NOTE(review): "LRDRIMM" in the last entry looks like a misspelling of
   "LRDIMM"; the logged text is left untouched here - confirm before changing
   a message that log consumers may match on.
*/
EventToMsgMap_t dimmTrainingFailureSyndromeToMsgMap = {
    {dimm_syndrome::NA, "(N/A)"},
    {dimm_syndrome::DRAM_VREFDQ_TRAINING_FAILURE,
     "(DRAM VREFDQ training failure)"},
    {dimm_syndrome::LRDIMM_DB_TRAINING_FAILURE, "(LRDIMM DB training failure)"},
    {dimm_syndrome::LRDRIMM_DB_SW_TRAINING_FAILURE,
     "(LRDRIMM DB SW training failure)"}};
172
/*
   A map between the DIMM training failure type and a pair of
   <logging string, syndrome map>.
   Using pldm::oem::dimm::training_failure::dimm_training_failure_type.
   Each pair holds its own copy of the syndrome map it was built from.
*/
std::unordered_map<uint8_t, std::pair<std::string, EventToMsgMap_t>>
    dimmTrainingFailureTypeMap = {
        {training_failure::PHY_TRAINING_FAILURE_TYPE,
         std::make_pair("PHY training failure",
                        phyTrainingFailureSyndromeToMsgMap)},
        {training_failure::DIMM_TRAINING_FAILURE_TYPE,
         std::make_pair("DIMM training failure",
                        dimmTrainingFailureSyndromeToMsgMap)}};
186
/*
   A map between the log level and the registry used for the Redfish SEL log.
   Using pldm::oem::log_level.
   sendJournalRedfish() refuses to log an event whose level is not listed.
*/
std::unordered_map<log_level, std::string> logLevelToRedfishMsgIdMap = {
    {log_level::OK, ampereEventRegistry},
    {log_level::WARNING, ampereWarningRegistry},
    {log_level::CRITICAL, ampereCriticalRegistry},
    {log_level::BIOSFWPANIC, BIOSFWPanicRegistry}};
196
/*
   A map between composite state sensor IDs and their components.
   Each sensor maps to a list of components indexed by the sensor offset of
   the state sensor event. Every component is a pair of
   <component name, map from event state to <log level, logging string>>.
   Used by processStateSensorEvent().
*/
std::unordered_map<
    uint16_t,
    std::vector<std::pair<
        std::string,
        std::unordered_map<uint8_t, std::pair<log_level, std::string>>>>>
    stateSensorToMsgMap = {
        {SOC_HEALTH_AVAILABILITY,
         {{"SoC Health",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::WARNING, "Non-Critical"}},
            {3, {log_level::CRITICAL, "Critical"}},
            {4, {log_level::CRITICAL, "Fatal"}}}},
          {"SoC Availability",
           {{1, {log_level::OK, "Enabled"}},
            {2, {log_level::WARNING, "Disabled"}},
            {3, {log_level::CRITICAL, "Shutdown"}}}}}},
        {WATCH_DOG,
         {{"Global Watch Dog",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::CRITICAL, "Timer Expired"}}}},
          {"Secure Watch Dog",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::CRITICAL, "Timer Expired"}}}},
          {"Non-secure Watch Dog",
           {{1, {log_level::OK, "Normal"}},
            {2, {log_level::CRITICAL, "Timer Expired"}}}}}}};
223
224 std::string
prefixMsgStrCreation(pldm_tid_t tid,uint16_t sensorId)225 OemEventManager::prefixMsgStrCreation(pldm_tid_t tid, uint16_t sensorId)
226 {
227 std::string description;
228 if (!tidToSocketNameMap.contains(tid))
229 {
230 description += "TID " + std::to_string(tid) + ": ";
231 }
232 else
233 {
234 description += tidToSocketNameMap[tid] + ": ";
235 }
236
237 if (!sensorIdToStrMap.contains(sensorId))
238 {
239 description += "Sensor ID " + std::to_string(sensorId) + ": ";
240 }
241 else
242 {
243 description += sensorIdToStrMap[sensorId] + ": ";
244 }
245
246 return description;
247 }
248
sendJournalRedfish(const std::string & description,log_level & logLevel)249 void OemEventManager::sendJournalRedfish(const std::string& description,
250 log_level& logLevel)
251 {
252 if (description.empty())
253 {
254 return;
255 }
256
257 if (!logLevelToRedfishMsgIdMap.contains(logLevel))
258 {
259 lg2::error("Invalid {LEVEL} Description {DES}", "LEVEL", logLevel,
260 "DES", description);
261 return;
262 }
263 auto redfishMsgId = logLevelToRedfishMsgIdMap[logLevel];
264 lg2::info("MESSAGE={DES}", "DES", description, "REDFISH_MESSAGE_ID",
265 redfishMsgId, "REDFISH_MESSAGE_ARGS", description);
266 }
267
dimmIdxsToString(uint32_t dimmIdxs)268 std::string OemEventManager::dimmIdxsToString(uint32_t dimmIdxs)
269 {
270 std::string description;
271 for (const auto bitIdx : std::views::iota(0, maxDIMMIdxBitNum))
272 {
273 if (dimmIdxs & (static_cast<uint32_t>(1) << bitIdx))
274 {
275 description += " #" + std::to_string(bitIdx);
276 }
277 }
278 return description;
279 }
280
sensorIdToDIMMIdx(const uint16_t & sensorId)281 uint8_t OemEventManager::sensorIdToDIMMIdx(const uint16_t& sensorId)
282 {
283 uint8_t dimmIdx = maxDIMMInstantNum;
284 int sensorId_Off = sensorId - 4;
285 if ((sensorId_Off >= 0) && ((sensorId_Off % 2) == 0) &&
286 ((sensorId_Off / 2) < maxDIMMInstantNum))
287 {
288 dimmIdx = sensorId_Off / 2;
289 }
290 return dimmIdx;
291 }
292
handleBootOverallEvent(pldm_tid_t,uint16_t,uint32_t presentReading)293 void OemEventManager::handleBootOverallEvent(
294 pldm_tid_t /*tid*/, uint16_t /*sensorId*/, uint32_t presentReading)
295 {
296 log_level logLevel{log_level::OK};
297 std::string description;
298 std::stringstream strStream;
299
300 uint8_t byte0 = (presentReading & 0x000000ff);
301 uint8_t byte1 = (presentReading & 0x0000ff00) >> 8;
302 uint8_t byte2 = (presentReading & 0x00ff0000) >> 16;
303 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
304 /*
305 * Handle SECpro, Mpro, ATF BL1, ATF BL2, ATF BL31,
306 * ATF BL32 and DDR initialization
307 */
308 if (bootStageToMsgMap.contains(byte3))
309 {
310 // Boot stage adding
311 description += bootStageToMsgMap[byte3];
312
313 switch (byte3)
314 {
315 case boot_stage::DDR_TRAINING:
316 if (byte0 >= ddrTrainingMsg.size())
317 {
318 logLevel = log_level::BIOSFWPANIC;
319 description += " unknown status";
320 }
321 else
322 {
323 description += ddrTrainingMsg[byte0];
324 }
325 if (0x01 == byte0)
326 {
327 // Add complete percentage
328 description += " at " + std::to_string(byte1) + "%";
329 }
330 break;
331 case boot_stage::S0_DDR_TRAINING_FAILURE:
332 case boot_stage::S1_DDR_TRAINING_FAILURE:
333 // ddr_training_status_msg()
334 logLevel = log_level::BIOSFWPANIC;
335 description += " at DIMMs:";
336 // dimmIdxs = presentReading & 0x00ffffff;
337 description += dimmIdxsToString(presentReading & 0x00ffffff);
338 description += " of socket ";
339 description +=
340 (boot_stage::S0_DDR_TRAINING_FAILURE == byte3) ? "0" : "1";
341 break;
342 default:
343 if (byte0 >= bootStatMsg.size())
344 {
345 logLevel = log_level::BIOSFWPANIC;
346 description += " unknown status";
347 }
348 else
349 {
350 description += bootStatMsg[byte0];
351 }
352 break;
353 }
354
355 // Sensor report action is fail
356 if (boot::status::BOOT_STATUS_FAILURE == byte2)
357 {
358 logLevel = log_level::BIOSFWPANIC;
359 }
360 }
361 else
362 {
363 if (byte3 <= boot_stage::UEFI_STATUS_CLASS_CODE_MAX)
364 {
365 description +=
366 bootStageToMsgMap[boot_stage::UEFI_STATUS_CLASS_CODE_MIN];
367
368 strStream
369 << "Segment (0x" << std::setfill('0') << std::hex
370 << std::setw(8) << static_cast<uint32_t>(presentReading)
371 << "); Status Class (0x" << std::setw(2)
372 << static_cast<uint32_t>(byte3) << "); Status SubClass (0x"
373 << std::setw(2) << static_cast<uint32_t>(byte2)
374 << "); Operation Code (0x" << std::setw(4)
375 << static_cast<uint32_t>((presentReading & 0xffff0000) >> 16)
376 << ")" << std::dec;
377
378 description += strStream.str();
379 }
380 }
381
382 // Log to Redfish event
383 sendJournalRedfish(description, logLevel);
384 }
385
processNumericSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)386 int OemEventManager::processNumericSensorEvent(
387 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
388 size_t sensorDataLength)
389 {
390 uint8_t eventState = 0;
391 uint8_t previousEventState = 0;
392 uint8_t sensorDataSize = 0;
393 uint32_t presentReading;
394 auto rc = decode_numeric_sensor_data(
395 sensorData, sensorDataLength, &eventState, &previousEventState,
396 &sensorDataSize, &presentReading);
397 if (rc)
398 {
399 lg2::error(
400 "Failed to decode numericSensorState event for terminus ID {TID}, error {RC} ",
401 "TID", tid, "RC", rc);
402 return rc;
403 }
404
405 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
406 if (auto dimmIdx = sensorIdToDIMMIdx(sensorId); dimmIdx < maxDIMMInstantNum)
407 {
408 handleDIMMStatusEvent(tid, sensorId, presentReading);
409 return PLDM_SUCCESS;
410 }
411
412 switch (sensorId)
413 {
414 case BOOT_OVERALL:
415 handleBootOverallEvent(tid, sensorId, presentReading);
416 break;
417 case PCIE_HOT_PLUG:
418 handlePCIeHotPlugEvent(tid, sensorId, presentReading);
419 break;
420 case DDR_STATUS:
421 handleDDRStatusEvent(tid, sensorId, presentReading);
422 break;
423 case PCP_VR_STATE:
424 case SOC_VR_STATE:
425 case DPHY_VR1_STATE:
426 case DPHY_VR2_STATE:
427 case D2D_VR_STATE:
428 case IOC_VR1_STATE:
429 case IOC_VR2_STATE:
430 case PCI_D_VR_STATE:
431 case PCI_A_VR_STATE:
432 handleVRDStatusEvent(tid, sensorId, presentReading);
433 break;
434 case WATCH_DOG:
435 handleNumericWatchdogEvent(tid, sensorId, presentReading);
436 break;
437 default:
438 std::string description;
439 std::stringstream strStream;
440 log_level logLevel = log_level::OK;
441
442 description += "SENSOR_EVENT : NUMERIC_SENSOR_STATE: ";
443 description += prefixMsgStrCreation(tid, sensorId);
444 strStream << std::setfill('0') << std::hex << "eventState 0x"
445 << std::setw(2) << static_cast<uint32_t>(eventState)
446 << " previousEventState 0x" << std::setw(2)
447 << static_cast<uint32_t>(previousEventState)
448 << " sensorDataSize 0x" << std::setw(2)
449 << static_cast<uint32_t>(sensorDataSize)
450 << " presentReading 0x" << std::setw(8)
451 << static_cast<uint32_t>(presentReading) << std::dec;
452 description += strStream.str();
453
454 sendJournalRedfish(description, logLevel);
455 break;
456 }
457 return PLDM_SUCCESS;
458 }
459
processStateSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)460 int OemEventManager::processStateSensorEvent(pldm_tid_t tid, uint16_t sensorId,
461 const uint8_t* sensorData,
462 size_t sensorDataLength)
463 {
464 uint8_t sensorOffset = 0;
465 uint8_t eventState = 0;
466 uint8_t previousEventState = 0;
467
468 auto rc =
469 decode_state_sensor_data(sensorData, sensorDataLength, &sensorOffset,
470 &eventState, &previousEventState);
471 if (rc)
472 {
473 lg2::error(
474 "Failed to decode stateSensorState event for terminus ID {TID}, error {RC}",
475 "TID", tid, "RC", rc);
476 return rc;
477 }
478
479 std::string description;
480 log_level logLevel = log_level::OK;
481
482 if (stateSensorToMsgMap.contains(sensorId))
483 {
484 description += prefixMsgStrCreation(tid, sensorId);
485 auto componentMap = stateSensorToMsgMap[sensorId];
486 if (sensorOffset < componentMap.size())
487 {
488 description += std::get<0>(componentMap[sensorOffset]);
489 auto stateMap = std::get<1>(componentMap[sensorOffset]);
490 if (stateMap.contains(eventState))
491 {
492 logLevel = std::get<0>(stateMap[eventState]);
493 description += " state : " + std::get<1>(stateMap[eventState]);
494 if (stateMap.contains(previousEventState))
495 {
496 description += "; previous state: " +
497 std::get<1>(stateMap[previousEventState]);
498 }
499 }
500 else
501 {
502 description += " sends unsupported event state: " +
503 std::to_string(eventState);
504 if (stateMap.contains(previousEventState))
505 {
506 description += "; previous state: " +
507 std::get<1>(stateMap[previousEventState]);
508 }
509 }
510 }
511 else
512 {
513 description += "sends unsupported component sensor offset " +
514 std::to_string(sensorOffset);
515 }
516 }
517 else
518 {
519 std::stringstream strStream;
520 description += "SENSOR_EVENT : STATE_SENSOR_STATE: ";
521 description += prefixMsgStrCreation(tid, sensorId);
522 strStream << std::setfill('0') << std::hex << "sensorOffset 0x"
523 << std::setw(2) << static_cast<uint32_t>(sensorOffset)
524 << "eventState 0x" << std::setw(2)
525 << static_cast<uint32_t>(eventState)
526 << " previousEventState 0x" << std::setw(2)
527 << static_cast<uint32_t>(previousEventState) << std::dec;
528 description += strStream.str();
529 }
530
531 sendJournalRedfish(description, logLevel);
532
533 return PLDM_SUCCESS;
534 }
535
processSensorOpStateEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)536 int OemEventManager::processSensorOpStateEvent(
537 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
538 size_t sensorDataLength)
539 {
540 uint8_t present_op_state = 0;
541 uint8_t previous_op_state = 0;
542
543 auto rc = decode_sensor_op_data(sensorData, sensorDataLength,
544 &present_op_state, &previous_op_state);
545 if (rc)
546 {
547 lg2::error(
548 "Failed to decode sensorOpState event for terminus ID {TID}, error {RC}",
549 "TID", tid, "RC", rc);
550 return rc;
551 }
552
553 std::string description;
554 std::stringstream strStream;
555 log_level logLevel = log_level::OK;
556
557 description += "SENSOR_EVENT : SENSOR_OP_STATE: ";
558 description += prefixMsgStrCreation(tid, sensorId);
559 strStream << std::setfill('0') << std::hex << "present_op_state 0x"
560 << std::setw(2) << static_cast<uint32_t>(present_op_state)
561 << "previous_op_state 0x" << std::setw(2)
562 << static_cast<uint32_t>(previous_op_state) << std::dec;
563 description += strStream.str();
564
565 sendJournalRedfish(description, logLevel);
566
567 return PLDM_SUCCESS;
568 }
569
handleSensorEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)570 int OemEventManager::handleSensorEvent(
571 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
572 pldm_tid_t tid, size_t eventDataOffset)
573 {
574 /* This OEM event handler is only used for SoC terminus*/
575 if (!tidToSocketNameMap.contains(tid))
576 {
577 return PLDM_SUCCESS;
578 }
579 auto eventData =
580 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
581 auto eventDataSize = payloadLength - eventDataOffset;
582
583 uint16_t sensorId = 0;
584 uint8_t sensorEventClassType = 0;
585 size_t eventClassDataOffset = 0;
586 auto rc =
587 decode_sensor_event_data(eventData, eventDataSize, &sensorId,
588 &sensorEventClassType, &eventClassDataOffset);
589 if (rc)
590 {
591 lg2::error("Failed to decode sensor event data return code {RC}.", "RC",
592 rc);
593 return rc;
594 }
595 const uint8_t* sensorData = eventData + eventClassDataOffset;
596 size_t sensorDataLength = eventDataSize - eventClassDataOffset;
597
598 switch (sensorEventClassType)
599 {
600 case PLDM_NUMERIC_SENSOR_STATE:
601 {
602 return processNumericSensorEvent(tid, sensorId, sensorData,
603 sensorDataLength);
604 }
605 case PLDM_STATE_SENSOR_STATE:
606 {
607 return processStateSensorEvent(tid, sensorId, sensorData,
608 sensorDataLength);
609 }
610 case PLDM_SENSOR_OP_STATE:
611 {
612 return processSensorOpStateEvent(tid, sensorId, sensorData,
613 sensorDataLength);
614 }
615 default:
616 std::string description;
617 std::stringstream strStream;
618 log_level logLevel = log_level::OK;
619
620 description += "SENSOR_EVENT : Unsupported Sensor Class " +
621 std::to_string(sensorEventClassType) + ": ";
622 description += prefixMsgStrCreation(tid, sensorId);
623 strStream << std::setfill('0') << std::hex
624 << std::setw(sizeof(sensorData) * 2) << "Sensor data: ";
625
626 auto dataPtr = sensorData;
627 for ([[maybe_unused]] const auto& i :
628 std::views::iota(0, (int)sensorDataLength))
629 {
630 strStream << "0x" << static_cast<uint32_t>(*dataPtr);
631 dataPtr += sizeof(sensorData);
632 }
633
634 description += strStream.str();
635
636 sendJournalRedfish(description, logLevel);
637 }
638 lg2::info("Unsupported class type {CLASSTYPE}", "CLASSTYPE",
639 sensorEventClassType);
640 return PLDM_ERROR;
641 }
642
handlePCIeHotPlugEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)643 void OemEventManager::handlePCIeHotPlugEvent(pldm_tid_t tid, uint16_t sensorId,
644 uint32_t presentReading)
645 {
646 std::string description;
647 std::stringstream strStream;
648 PCIeHotPlugEventRecord_t record{presentReading};
649
650 std::string sAction = (!record.bits.action) ? "Insertion" : "Removal";
651 std::string sOpStatus = (!record.bits.opStatus) ? "Successful" : "Failed";
652 log_level logLevel =
653 (!record.bits.opStatus) ? log_level::OK : log_level::WARNING;
654
655 description += prefixMsgStrCreation(tid, sensorId);
656
657 strStream << "Segment (0x" << std::setfill('0') << std::hex << std::setw(2)
658 << static_cast<uint32_t>(record.bits.segment) << "); Bus (0x"
659 << std::setw(2) << static_cast<uint32_t>(record.bits.bus)
660 << "); Device (0x" << std::setw(2)
661 << static_cast<uint32_t>(record.bits.device) << "); Function (0x"
662 << std::setw(2) << static_cast<uint32_t>(record.bits.function)
663 << "); Action (" << sAction << "); Operation status ("
664 << sOpStatus << "); Media slot number (" << std::dec
665 << static_cast<uint32_t>(record.bits.mediaSlot) << ")";
666
667 description += strStream.str();
668
669 // Log to Redfish event
670 sendJournalRedfish(description, logLevel);
671 }
672
dimmTrainingFailureToMsg(uint32_t failureInfo)673 std::string OemEventManager::dimmTrainingFailureToMsg(uint32_t failureInfo)
674 {
675 std::string description;
676 DIMMTrainingFailure_t failure{failureInfo};
677
678 if (dimmTrainingFailureTypeMap.contains(failure.bits.type))
679 {
680 auto failureInfoMap = dimmTrainingFailureTypeMap[failure.bits.type];
681
682 description += std::get<0>(failureInfoMap);
683
684 description += "; MCU rank index " +
685 std::to_string(failure.bits.mcuRankIdx);
686
687 description += "; Slice number " +
688 std::to_string(failure.bits.sliceNum);
689
690 description += "; Upper nibble error status: ";
691 description += (!failure.bits.upperNibbStatErr)
692 ? "No error"
693 : "Found no rising edge";
694
695 description += "; Lower nibble error status: ";
696 description += (!failure.bits.lowerNibbStatErr)
697 ? "No error"
698 : "Found no rising edge";
699
700 description += "; Failure syndrome 0: ";
701
702 auto& syndromeMap = std::get<1>(failureInfoMap);
703 if (syndromeMap.contains(failure.bits.syndrome))
704 {
705 description += syndromeMap[failure.bits.syndrome];
706 }
707 else
708 {
709 description += "(Unknown syndrome)";
710 }
711 }
712 else
713 {
714 description += "Unknown training failure type " +
715 std::to_string(failure.bits.type);
716 }
717
718 return description;
719 }
720
handleDIMMStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)721 void OemEventManager::handleDIMMStatusEvent(pldm_tid_t tid, uint16_t sensorId,
722 uint32_t presentReading)
723 {
724 log_level logLevel{log_level::WARNING};
725 std::string description;
726 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
727 uint32_t byte012 = presentReading & 0xffffff;
728
729 description += prefixMsgStrCreation(tid, sensorId);
730
731 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
732 auto dimmIdx = sensorIdToDIMMIdx(sensorId);
733 if (dimmIdx >= maxDIMMIdxBitNum)
734 {
735 return;
736 }
737
738 description += "DIMM " + std::to_string(dimmIdx) + " ";
739
740 if (dimmStatusToMsgMap.contains(byte3))
741 {
742 if (byte3 == dimm_status::INSTALLED_NO_ERROR ||
743 byte3 == dimm_status::INSTALLED_BUT_DISABLED)
744 {
745 logLevel = log_level::OK;
746 }
747
748 description += dimmStatusToMsgMap[byte3];
749
750 if (byte3 == dimm_status::TRAINING_FAILURE)
751 {
752 description += "; " + dimmTrainingFailureToMsg(byte012);
753 }
754 else if (byte3 == dimm_status::PMIC_TEMP_ALERT)
755 {
756 uint8_t byte0 = (byte012 & 0xff);
757 if (byte0 < pmicTempAlertMsg.size())
758 {
759 description += ": " + pmicTempAlertMsg[byte0];
760 }
761 }
762 }
763 else
764 {
765 switch (byte3)
766 {
767 case dimm_status::PMIC_HIGH_TEMP:
768 if (byte012 == 0x01)
769 {
770 description += "has PMIC high temp condition";
771 }
772 break;
773 case dimm_status::TSx_HIGH_TEMP:
774 switch (byte012)
775 {
776 case 0x01:
777 description += "has TS0";
778 break;
779 case 0x02:
780 description += "has TS1";
781 break;
782 case 0x03:
783 description += "has TS0 and TS1";
784 break;
785 }
786 description += " exceeding their high temperature threshold";
787 break;
788 case dimm_status::SPD_HUB_HIGH_TEMP:
789 if (byte012 == 0x01)
790 {
791 description += "has SPD/HUB high temp condition";
792 }
793 break;
794 default:
795 description += "has unsupported status " +
796 std::to_string(byte3);
797 break;
798 }
799 }
800
801 // Log to Redfish event
802 sendJournalRedfish(description, logLevel);
803 }
804
handleDDRStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)805 void OemEventManager::handleDDRStatusEvent(pldm_tid_t tid, uint16_t sensorId,
806 uint32_t presentReading)
807 {
808 log_level logLevel{log_level::WARNING};
809 std::string description;
810 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
811 uint32_t byte012 = presentReading & 0xffffff;
812
813 description += prefixMsgStrCreation(tid, sensorId);
814
815 description += "DDR ";
816 if (ddrStatusToMsgMap.contains(byte3))
817 {
818 if (byte3 == ddr_status::NO_SYSTEM_LEVEL_ERROR)
819 {
820 logLevel = log_level::OK;
821 }
822
823 description += ddrStatusToMsgMap[byte3];
824
825 if (byte3 == ddr_status::CONFIGURATION_FAILURE ||
826 byte3 == ddr_status::TRAINING_FAILURE)
827 {
828 // List out failed DIMMs
829 description += dimmIdxsToString(byte012);
830 }
831 }
832 else
833 {
834 description += "has unsupported status " + std::to_string(byte3);
835 }
836
837 // Log to Redfish event
838 sendJournalRedfish(description, logLevel);
839 }
840
handleVRDStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)841 void OemEventManager::handleVRDStatusEvent(pldm_tid_t tid, uint16_t sensorId,
842 uint32_t presentReading)
843 {
844 log_level logLevel{log_level::WARNING};
845 std::string description;
846 std::stringstream strStream;
847
848 description += prefixMsgStrCreation(tid, sensorId);
849
850 VRDStatus_t status{presentReading};
851
852 if (status.bits.warning && status.bits.critical)
853 {
854 description += "A VR warning and a VR critical";
855 logLevel = log_level::CRITICAL;
856 }
857 else
858 {
859 if (status.bits.warning)
860 {
861 description += "A VR warning";
862 }
863 else if (status.bits.critical)
864 {
865 description += "A VR critical";
866 logLevel = log_level::CRITICAL;
867 }
868 else
869 {
870 description += "No VR warning or critical";
871 logLevel = log_level::OK;
872 }
873 }
874 description += " condition observed";
875
876 strStream << "; VR status byte high is 0x" << std::setfill('0') << std::hex
877 << std::setw(2)
878 << static_cast<uint32_t>(status.bits.vr_status_byte_high)
879 << "; VR status byte low is 0x" << std::setw(2)
880 << static_cast<uint32_t>(status.bits.vr_status_byte_low)
881 << "; Reading is 0x" << std::setw(2)
882 << static_cast<uint32_t>(presentReading) << ";";
883
884 description += strStream.str();
885
886 // Log to Redfish event
887 sendJournalRedfish(description, logLevel);
888 }
889
handleNumericWatchdogEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)890 void OemEventManager::handleNumericWatchdogEvent(
891 pldm_tid_t tid, uint16_t sensorId, uint32_t presentReading)
892 {
893 std::string description;
894 log_level logLevel = log_level::CRITICAL;
895
896 description += prefixMsgStrCreation(tid, sensorId);
897
898 if (presentReading & 0x01)
899 {
900 description += "Global watchdog expired;";
901 }
902 if (presentReading & 0x02)
903 {
904 description += "Secure watchdog expired;";
905 }
906 if (presentReading & 0x04)
907 {
908 description += "Non-secure watchdog expired;";
909 }
910
911 // Log to Redfish event
912 sendJournalRedfish(description, logLevel);
913 }
914
processOemMsgPollEvent(pldm_tid_t tid,uint16_t eventId,const uint8_t * eventData,size_t eventDataSize)915 int OemEventManager::processOemMsgPollEvent(pldm_tid_t tid, uint16_t eventId,
916 const uint8_t* eventData,
917 size_t eventDataSize)
918 {
919 EFI_AMPERE_ERROR_DATA ampHdr;
920
921 decodeCperRecord(eventData, eventDataSize, &Hdr);
922
923 addCperSELLog(tid, eventId, &Hdr);
924
925 /* isBert at bit 12 of TypeId */
926 if (ampHdr.TypeId & 0x0800)
927 {
928 lg2::info("Ampere SoC BERT is triggered.");
929 std::variant<std::string> value(
930 "com.ampere.CrashCapture.Trigger.TriggerAction.Bert");
931 try
932 {
933 auto& bus = pldm::utils::DBusHandler::getBus();
934 auto method =
935 bus.new_method_call("com.ampere.CrashCapture.Trigger",
936 "/com/ampere/crashcapture/trigger",
937 pldm::utils::dbusProperties, "Set");
938 method.append("com.ampere.CrashCapture.Trigger", "TriggerActions",
939 value);
940 bus.call_noreply(method);
941 }
942 catch (const std::exception& e)
943 {
944 lg2::error("call BERT trigger error - {ERROR}", "ERROR", e);
945 }
946 }
947
948 return PLDM_SUCCESS;
949 }
950
handlepldmMessagePollEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)951 int OemEventManager::handlepldmMessagePollEvent(
952 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
953 pldm_tid_t tid, size_t eventDataOffset)
954 {
955 /* This OEM event handler is only used for SoC terminus*/
956 if (!tidToSocketNameMap.contains(tid))
957 {
958 return PLDM_SUCCESS;
959 }
960
961 auto eventData =
962 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
963 auto eventDataSize = payloadLength - eventDataOffset;
964
965 pldm_message_poll_event poll_event{};
966 auto rc = decode_pldm_message_poll_event_data(eventData, eventDataSize,
967 &poll_event);
968 if (rc)
969 {
970 lg2::error("Failed to decode PldmMessagePollEvent event, error {RC} ",
971 "RC", rc);
972 return rc;
973 }
974
975 auto sensorID = poll_event.event_id;
976 /* The UE errors */
977 if (rasUESensorIDs.contains(sensorID))
978 {
979 pldm::utils::DBusMapping dbusMapping{
980 "/xyz/openbmc_project/led/groups/ras_ue_fault",
981 "xyz.openbmc_project.Led.Group", "Asserted", "bool"};
982 try
983 {
984 pldm::utils::DBusHandler().setDbusProperty(
985 dbusMapping, pldm::utils::PropertyValue{bool(true)});
986 }
987 catch (const std::exception& e)
988 {
989 lg2::error(
990 "Failed to set the RAS UE LED terminus ID {TID} sensor ID {SENSORID} - errors {ERROR}",
991 "TID", tid, "SENSORID", sensorID, "ERROR", e);
992 }
993 }
994
995 return PLDM_SUCCESS;
996 }
997
oemPollForPlatformEvent(pldm_tid_t tid)998 exec::task<int> OemEventManager::oemPollForPlatformEvent(pldm_tid_t tid)
999 {
1000 uint64_t t0 = 0;
1001
1002 /* This OEM event handler is only used for SoC terminus */
1003 if (!tidToSocketNameMap.contains(tid))
1004 {
1005 co_return PLDM_SUCCESS;
1006 }
1007
1008 if (!timeStampMap.contains(tid))
1009 {
1010 sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1011 timeStampMap.emplace(std::make_pair(tid, t0));
1012 }
1013 else
1014 {
1015 sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1016 uint64_t elapsed = t0 - timeStampMap[tid];
1017 if (elapsed >= NORMAL_EVENT_POLLING_TIME)
1018 {
1019 co_await manager->pollForPlatformEvent(tid, 0, 0);
1020 timeStampMap[tid] = t0;
1021 }
1022 }
1023
1024 co_return PLDM_SUCCESS;
1025 }
1026 } // namespace oem_ampere
1027 } // namespace pldm
1028