1 #include "oem_event_manager.hpp"
2
3 #include "libcper/Cper.h"
4
5 #include "cper.hpp"
6 #include "requester/handler.hpp"
7 #include "requester/request.hpp"
8
9 #include <config.h>
10 #include <libpldm/pldm.h>
11 #include <libpldm/utils.h>
12 #include <systemd/sd-journal.h>
13
14 #include <phosphor-logging/lg2.hpp>
15 #include <xyz/openbmc_project/Logging/Entry/server.hpp>
16
17 #include <algorithm>
18 #include <map>
19 #include <set>
20 #include <sstream>
21 #include <string>
22 #include <unordered_map>
23
24 namespace pldm
25 {
26 namespace oem_ampere
27 {
28 namespace boot_stage = boot::stage;
29 namespace ddr_status = ddr::status;
30 namespace dimm_status = dimm::status;
31 namespace dimm_syndrome = dimm::training_failure::dimm_syndrome;
32 namespace phy_syndrome = dimm::training_failure::phy_syndrome;
33 namespace training_failure = dimm::training_failure;
34
35 constexpr const char* ampereEventRegistry = "OpenBMC.0.1.AmpereEvent.OK";
36 constexpr const char* ampereWarningRegistry =
37 "OpenBMC.0.1.AmpereWarning.Warning";
38 constexpr const char* ampereCriticalRegistry =
39 "OpenBMC.0.1.AmpereCritical.Critical";
40 constexpr const char* BIOSFWPanicRegistry =
41 "OpenBMC.0.1.BIOSFirmwarePanicReason.Warning";
42 constexpr auto maxDIMMIdxBitNum = 24;
43 constexpr auto maxDIMMInstantNum = 24;
44
45 const std::set<uint16_t> rasUESensorIDs = {CORE_UE, MCU_UE, PCIE_UE, SOC_UE};
46
/*
  Status suffixes for a generic boot stage.
  Indexed by byte 0 of the boot code.
*/
std::array<std::string, 3> bootStatMsg{" booting", " completed", " failed"};

/*
  Status suffixes for the DDR training stage.
  Indexed by byte 0 of the boot code.
*/
std::array<std::string, 3> ddrTrainingMsg{
    " progress started", " in-progress", " progress completed"};

/*
  PMIC temperature alert strings.
  Indexed by byte 0 of a DIMM status reading that reports a PMIC alert.
*/
std::array<std::string, 8> pmicTempAlertMsg{
    "Below 85°C", "85°C",  "95°C",  "105°C",
    "115°C",      "125°C", "135°C", "Equal or greater than 140°C"};
66
67 /*
68 In Ampere systems, BMC only directly communicates with MCTP/PLDM SoC
69 EPs through SMBus and PCIe. When host boots up, SMBUS interface
70 comes up first. In this interface, BMC is bus owner.
71
72 mctpd will set the EID 0x14 for S0 and 0x16 for S1 (if available).
73 pldmd will always use TID 1 for S0 and TID 2 for S1 (if available).
74 */
75 EventToMsgMap_t tidToSocketNameMap = {{1, "SOCKET 0"}, {2, "SOCKET 1"}};
76
77 /*
78 A map between sensor IDs and their names in string.
79 Using pldm::oem::sensor_ids
80 */
81 EventToMsgMap_t sensorIdToStrMap = {
82 {DDR_STATUS, "DDR_STATUS"},
83 {PCP_VR_STATE, "PCP_VR_STATE"},
84 {SOC_VR_STATE, "SOC_VR_STATE"},
85 {DPHY_VR1_STATE, "DPHY_VR1_STATE"},
86 {DPHY_VR2_STATE, "DPHY_VR2_STATE"},
87 {D2D_VR_STATE, "D2D_VR_STATE"},
88 {IOC_VR1_STATE, "IOC_VR1_STATE"},
89 {IOC_VR2_STATE, "IOC_VR2_STATE"},
90 {PCI_D_VR_STATE, "PCI_D_VR_STATE"},
91 {PCI_A_VR_STATE, "PCI_A_VR_STATE"},
92 {PCIE_HOT_PLUG, "PCIE_HOT_PLUG"},
93 {BOOT_OVERALL, "BOOT_OVERALL"},
94 {SOC_HEALTH_AVAILABILITY, "SOC_HEALTH_AVAILABILITY"},
95 {WATCH_DOG, "WATCH_DOG"}};
96
97 /*
98 A map between the boot stages and logging strings.
99 Using pldm::oem::boot::stage::boot_stage
100 */
101 EventToMsgMap_t bootStageToMsgMap = {
102 {boot_stage::SECPRO, "SECpro"},
103 {boot_stage::MPRO, "Mpro"},
104 {boot_stage::ATF_BL1, "ATF BL1"},
105 {boot_stage::ATF_BL2, "ATF BL2"},
106 {boot_stage::DDR_INITIALIZATION, "DDR initialization"},
107 {boot_stage::DDR_TRAINING, "DDR training"},
108 {boot_stage::S0_DDR_TRAINING_FAILURE, "DDR training failure"},
109 {boot_stage::ATF_BL31, "ATF BL31"},
110 {boot_stage::ATF_BL32, "ATF BL32"},
111 {boot_stage::S1_DDR_TRAINING_FAILURE, "DDR training failure"},
112 {boot_stage::UEFI_STATUS_CLASS_CODE_MIN,
113 "ATF BL33 (UEFI) booting status = "}};
114
115 /*
116 A map between DDR status and logging strings.
117 Using pldm::oem::ddr::status::ddr_status
118 */
119 EventToMsgMap_t ddrStatusToMsgMap = {
120 {ddr_status::NO_SYSTEM_LEVEL_ERROR, "has no system level error"},
121 {ddr_status::ECC_INITIALIZATION_FAILURE, "has ECC initialization failure"},
122 {ddr_status::CONFIGURATION_FAILURE, "has configuration failure at DIMMs:"},
123 {ddr_status::TRAINING_FAILURE, "has training failure at DIMMs:"},
124 {ddr_status::OTHER_FAILURE, "has other failure"},
125 {ddr_status::BOOT_FAILURE_NO_VALID_CONFIG,
126 "has boot failure due to no configuration"},
127 {ddr_status::FAILSAFE_ACTIVATED_NEXT_BOOT_SUCCESS,
128 "failsafe activated but boot success with the next valid configuration"}};
129
130 /*
131 A map between DIMM status and logging strings.
132 Using pldm::oem::dimm::status::dimm_status
133 */
134 EventToMsgMap_t dimmStatusToMsgMap = {
135 {dimm_status::INSTALLED_NO_ERROR, "is installed and no error"},
136 {dimm_status::NOT_INSTALLED, "is not installed"},
137 {dimm_status::OTHER_FAILURE, "has other failure"},
138 {dimm_status::INSTALLED_BUT_DISABLED, "is installed but disabled"},
139 {dimm_status::TRAINING_FAILURE, "has training failure; "},
140 {dimm_status::PMIC_TEMP_ALERT, "has PMIC temperature alert"}};
141
142 /*
143 A map between PHY training failure syndrome and logging strings.
144 Using
145 pldm::oem::dimm::training_faillure::phy_syndrome::phy_training_failure_syndrome
146 */
147 EventToMsgMap_t phyTrainingFailureSyndromeToMsgMap = {
148 {phy_syndrome::NA, "(N/A)"},
149 {phy_syndrome::PHY_TRAINING_SETUP_FAILURE, "(PHY training setup failure)"},
150 {phy_syndrome::CA_LEVELING, "(CA leveling)"},
151 {phy_syndrome::PHY_WRITE_LEVEL_FAILURE,
152 "(PHY write level failure - see syndrome 1)"},
153 {phy_syndrome::PHY_READ_GATE_LEVELING_FAILURE,
154 "(PHY read gate leveling failure)"},
155 {phy_syndrome::PHY_READ_LEVEL_FAILURE, "(PHY read level failure)"},
156 {phy_syndrome::WRITE_DQ_LEVELING, "(Write DQ leveling)"},
157 {phy_syndrome::PHY_SW_TRAINING_FAILURE, "(PHY SW training failure)"}};
158
159 /*
160 A map between DIMM training failure syndrome and logging strings.
161 Using
162 pldm::oem::dimm::training_faillure::dimm_syndrome::dimm_training_failure_syndrome
163 */
164 EventToMsgMap_t dimmTrainingFailureSyndromeToMsgMap = {
165 {dimm_syndrome::NA, "(N/A)"},
166 {dimm_syndrome::DRAM_VREFDQ_TRAINING_FAILURE,
167 "(DRAM VREFDQ training failure)"},
168 {dimm_syndrome::LRDIMM_DB_TRAINING_FAILURE, "(LRDIMM DB training failure)"},
169 {dimm_syndrome::LRDRIMM_DB_SW_TRAINING_FAILURE,
170 "(LRDRIMM DB SW training failure)"}};
171
172 /*
173 A map between DIMM training failure type and a pair of <logging strings -
174 syndrome map>. Using
175 pldm::oem::dimm::training_faillure::dimm_training_failure_type
176 */
177 std::unordered_map<uint8_t, std::pair<std::string, EventToMsgMap_t>>
178 dimmTrainingFailureTypeMap = {
179 {training_failure::PHY_TRAINING_FAILURE_TYPE,
180 std::make_pair("PHY training failure",
181 phyTrainingFailureSyndromeToMsgMap)},
182 {training_failure::DIMM_TRAINING_FAILURE_TYPE,
183 std::make_pair("DIMM training failure",
184 dimmTrainingFailureSyndromeToMsgMap)}};
185
186 /*
187 A map between log level and the registry used for Redfish SEL log
188 Using pldm::oem::log_level
189 */
190 std::unordered_map<log_level, std::string> logLevelToRedfishMsgIdMap = {
191 {log_level::OK, ampereEventRegistry},
192 {log_level::WARNING, ampereWarningRegistry},
193 {log_level::CRITICAL, ampereCriticalRegistry},
194 {log_level::BIOSFWPANIC, BIOSFWPanicRegistry}};
195
196 std::unordered_map<
197 uint16_t,
198 std::vector<std::pair<
199 std::string,
200 std::unordered_map<uint8_t, std::pair<log_level, std::string>>>>>
201 stateSensorToMsgMap = {
202 {SOC_HEALTH_AVAILABILITY,
203 {{"SoC Health",
204 {{1, {log_level::OK, "Normal"}},
205 {2, {log_level::WARNING, "Non-Critical"}},
206 {3, {log_level::CRITICAL, "Critical"}},
207 {4, {log_level::CRITICAL, "Fatal"}}}},
208 {"SoC Availability",
209 {{1, {log_level::OK, "Enabled"}},
210 {2, {log_level::WARNING, "Disabled"}},
211 {3, {log_level::CRITICAL, "Shutdown"}}}}}},
212 {WATCH_DOG,
213 {{"Global Watch Dog",
214 {{1, {log_level::OK, "Normal"}},
215 {2, {log_level::CRITICAL, "Timer Expired"}}}},
216 {"Secure Watch Dog",
217 {{1, {log_level::OK, "Normal"}},
218 {2, {log_level::CRITICAL, "Timer Expired"}}}},
219 {"Non-secure Watch Dog",
220 {{1, {log_level::OK, "Normal"}},
221 {2, {log_level::CRITICAL, "Timer Expired"}}}}}}};
222
223 std::string
prefixMsgStrCreation(pldm_tid_t tid,uint16_t sensorId)224 OemEventManager::prefixMsgStrCreation(pldm_tid_t tid, uint16_t sensorId)
225 {
226 std::string description;
227 if (!tidToSocketNameMap.contains(tid))
228 {
229 description += "TID " + std::to_string(tid) + ": ";
230 }
231 else
232 {
233 description += tidToSocketNameMap[tid] + ": ";
234 }
235
236 if (!sensorIdToStrMap.contains(sensorId))
237 {
238 description += "Sensor ID " + std::to_string(sensorId) + ": ";
239 }
240 else
241 {
242 description += sensorIdToStrMap[sensorId] + ": ";
243 }
244
245 return description;
246 }
247
sendJournalRedfish(const std::string & description,log_level & logLevel)248 void OemEventManager::sendJournalRedfish(const std::string& description,
249 log_level& logLevel)
250 {
251 if (description.empty())
252 {
253 return;
254 }
255
256 if (!logLevelToRedfishMsgIdMap.contains(logLevel))
257 {
258 lg2::error("Invalid {LEVEL} Description {DES}", "LEVEL", logLevel,
259 "DES", description);
260 return;
261 }
262 auto redfishMsgId = logLevelToRedfishMsgIdMap[logLevel];
263 lg2::info("MESSAGE={DES}", "DES", description, "REDFISH_MESSAGE_ID",
264 redfishMsgId, "REDFISH_MESSAGE_ARGS", description);
265 }
266
dimmIdxsToString(uint32_t dimmIdxs)267 std::string OemEventManager::dimmIdxsToString(uint32_t dimmIdxs)
268 {
269 std::string description;
270 for (const auto bitIdx : std::views::iota(0, maxDIMMIdxBitNum))
271 {
272 if (dimmIdxs & (static_cast<uint32_t>(1) << bitIdx))
273 {
274 description += " #" + std::to_string(bitIdx);
275 }
276 }
277 return description;
278 }
279
sensorIdToDIMMIdx(const uint16_t & sensorId)280 uint8_t OemEventManager::sensorIdToDIMMIdx(const uint16_t& sensorId)
281 {
282 uint8_t dimmIdx = maxDIMMInstantNum;
283 int sensorId_Off = sensorId - 4;
284 if ((sensorId_Off >= 0) && ((sensorId_Off % 2) == 0) &&
285 ((sensorId_Off / 2) < maxDIMMInstantNum))
286 {
287 dimmIdx = sensorId_Off / 2;
288 }
289 return dimmIdx;
290 }
291
handleBootOverallEvent(pldm_tid_t,uint16_t,uint32_t presentReading)292 void OemEventManager::handleBootOverallEvent(
293 pldm_tid_t /*tid*/, uint16_t /*sensorId*/, uint32_t presentReading)
294 {
295 log_level logLevel{log_level::OK};
296 std::string description;
297 std::stringstream strStream;
298
299 uint8_t byte0 = (presentReading & 0x000000ff);
300 uint8_t byte1 = (presentReading & 0x0000ff00) >> 8;
301 uint8_t byte2 = (presentReading & 0x00ff0000) >> 16;
302 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
303 /*
304 * Handle SECpro, Mpro, ATF BL1, ATF BL2, ATF BL31,
305 * ATF BL32 and DDR initialization
306 */
307 if (bootStageToMsgMap.contains(byte3))
308 {
309 // Boot stage adding
310 description += bootStageToMsgMap[byte3];
311
312 switch (byte3)
313 {
314 case boot_stage::DDR_TRAINING:
315 if (byte0 >= ddrTrainingMsg.size())
316 {
317 logLevel = log_level::BIOSFWPANIC;
318 description += " unknown status";
319 }
320 else
321 {
322 description += ddrTrainingMsg[byte0];
323 }
324 if (0x01 == byte0)
325 {
326 // Add complete percentage
327 description += " at " + std::to_string(byte1) + "%";
328 }
329 break;
330 case boot_stage::S0_DDR_TRAINING_FAILURE:
331 case boot_stage::S1_DDR_TRAINING_FAILURE:
332 // ddr_training_status_msg()
333 logLevel = log_level::BIOSFWPANIC;
334 description += " at DIMMs:";
335 // dimmIdxs = presentReading & 0x00ffffff;
336 description += dimmIdxsToString(presentReading & 0x00ffffff);
337 description += " of socket ";
338 description +=
339 (boot_stage::S0_DDR_TRAINING_FAILURE == byte3) ? "0" : "1";
340 break;
341 default:
342 if (byte0 >= bootStatMsg.size())
343 {
344 logLevel = log_level::BIOSFWPANIC;
345 description += " unknown status";
346 }
347 else
348 {
349 description += bootStatMsg[byte0];
350 }
351 break;
352 }
353
354 // Sensor report action is fail
355 if (boot::status::BOOT_STATUS_FAILURE == byte2)
356 {
357 logLevel = log_level::BIOSFWPANIC;
358 }
359 }
360 else
361 {
362 if (byte3 <= boot_stage::UEFI_STATUS_CLASS_CODE_MAX)
363 {
364 description +=
365 bootStageToMsgMap[boot_stage::UEFI_STATUS_CLASS_CODE_MIN];
366
367 strStream
368 << "Segment (0x" << std::setfill('0') << std::hex
369 << std::setw(8) << static_cast<uint32_t>(presentReading)
370 << "); Status Class (0x" << std::setw(2)
371 << static_cast<uint32_t>(byte3) << "); Status SubClass (0x"
372 << std::setw(2) << static_cast<uint32_t>(byte2)
373 << "); Operation Code (0x" << std::setw(4)
374 << static_cast<uint32_t>((presentReading & 0xffff0000) >> 16)
375 << ")" << std::dec;
376
377 description += strStream.str();
378 }
379 }
380
381 // Log to Redfish event
382 sendJournalRedfish(description, logLevel);
383 }
384
processNumericSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)385 int OemEventManager::processNumericSensorEvent(
386 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
387 size_t sensorDataLength)
388 {
389 uint8_t eventState = 0;
390 uint8_t previousEventState = 0;
391 uint8_t sensorDataSize = 0;
392 uint32_t presentReading;
393 auto rc = decode_numeric_sensor_data(
394 sensorData, sensorDataLength, &eventState, &previousEventState,
395 &sensorDataSize, &presentReading);
396 if (rc)
397 {
398 lg2::error(
399 "Failed to decode numericSensorState event for terminus ID {TID}, error {RC} ",
400 "TID", tid, "RC", rc);
401 return rc;
402 }
403
404 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
405 if (auto dimmIdx = sensorIdToDIMMIdx(sensorId); dimmIdx < maxDIMMInstantNum)
406 {
407 handleDIMMStatusEvent(tid, sensorId, presentReading);
408 return PLDM_SUCCESS;
409 }
410
411 switch (sensorId)
412 {
413 case BOOT_OVERALL:
414 handleBootOverallEvent(tid, sensorId, presentReading);
415 break;
416 case PCIE_HOT_PLUG:
417 handlePCIeHotPlugEvent(tid, sensorId, presentReading);
418 break;
419 case DDR_STATUS:
420 handleDDRStatusEvent(tid, sensorId, presentReading);
421 break;
422 case PCP_VR_STATE:
423 case SOC_VR_STATE:
424 case DPHY_VR1_STATE:
425 case DPHY_VR2_STATE:
426 case D2D_VR_STATE:
427 case IOC_VR1_STATE:
428 case IOC_VR2_STATE:
429 case PCI_D_VR_STATE:
430 case PCI_A_VR_STATE:
431 handleVRDStatusEvent(tid, sensorId, presentReading);
432 break;
433 case WATCH_DOG:
434 handleNumericWatchdogEvent(tid, sensorId, presentReading);
435 break;
436 default:
437 std::string description;
438 std::stringstream strStream;
439 log_level logLevel = log_level::OK;
440
441 description += "SENSOR_EVENT : NUMERIC_SENSOR_STATE: ";
442 description += prefixMsgStrCreation(tid, sensorId);
443 strStream << std::setfill('0') << std::hex << "eventState 0x"
444 << std::setw(2) << static_cast<uint32_t>(eventState)
445 << " previousEventState 0x" << std::setw(2)
446 << static_cast<uint32_t>(previousEventState)
447 << " sensorDataSize 0x" << std::setw(2)
448 << static_cast<uint32_t>(sensorDataSize)
449 << " presentReading 0x" << std::setw(8)
450 << static_cast<uint32_t>(presentReading) << std::dec;
451 description += strStream.str();
452
453 sendJournalRedfish(description, logLevel);
454 break;
455 }
456 return PLDM_SUCCESS;
457 }
458
processStateSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)459 int OemEventManager::processStateSensorEvent(pldm_tid_t tid, uint16_t sensorId,
460 const uint8_t* sensorData,
461 size_t sensorDataLength)
462 {
463 uint8_t sensorOffset = 0;
464 uint8_t eventState = 0;
465 uint8_t previousEventState = 0;
466
467 auto rc =
468 decode_state_sensor_data(sensorData, sensorDataLength, &sensorOffset,
469 &eventState, &previousEventState);
470 if (rc)
471 {
472 lg2::error(
473 "Failed to decode stateSensorState event for terminus ID {TID}, error {RC}",
474 "TID", tid, "RC", rc);
475 return rc;
476 }
477
478 std::string description;
479 log_level logLevel = log_level::OK;
480
481 if (stateSensorToMsgMap.contains(sensorId))
482 {
483 description += prefixMsgStrCreation(tid, sensorId);
484 auto componentMap = stateSensorToMsgMap[sensorId];
485 if (sensorOffset < componentMap.size())
486 {
487 description += std::get<0>(componentMap[sensorOffset]);
488 auto stateMap = std::get<1>(componentMap[sensorOffset]);
489 if (stateMap.contains(eventState))
490 {
491 logLevel = std::get<0>(stateMap[eventState]);
492 description += " state : " + std::get<1>(stateMap[eventState]);
493 if (stateMap.contains(previousEventState))
494 {
495 description += "; previous state: " +
496 std::get<1>(stateMap[previousEventState]);
497 }
498 }
499 else
500 {
501 description += " sends unsupported event state: " +
502 std::to_string(eventState);
503 if (stateMap.contains(previousEventState))
504 {
505 description += "; previous state: " +
506 std::get<1>(stateMap[previousEventState]);
507 }
508 }
509 }
510 else
511 {
512 description += "sends unsupported component sensor offset " +
513 std::to_string(sensorOffset);
514 }
515 }
516 else
517 {
518 std::stringstream strStream;
519 description += "SENSOR_EVENT : STATE_SENSOR_STATE: ";
520 description += prefixMsgStrCreation(tid, sensorId);
521 strStream << std::setfill('0') << std::hex << "sensorOffset 0x"
522 << std::setw(2) << static_cast<uint32_t>(sensorOffset)
523 << "eventState 0x" << std::setw(2)
524 << static_cast<uint32_t>(eventState)
525 << " previousEventState 0x" << std::setw(2)
526 << static_cast<uint32_t>(previousEventState) << std::dec;
527 description += strStream.str();
528 }
529
530 sendJournalRedfish(description, logLevel);
531
532 return PLDM_SUCCESS;
533 }
534
processSensorOpStateEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)535 int OemEventManager::processSensorOpStateEvent(
536 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
537 size_t sensorDataLength)
538 {
539 uint8_t present_op_state = 0;
540 uint8_t previous_op_state = 0;
541
542 auto rc = decode_sensor_op_data(sensorData, sensorDataLength,
543 &present_op_state, &previous_op_state);
544 if (rc)
545 {
546 lg2::error(
547 "Failed to decode sensorOpState event for terminus ID {TID}, error {RC}",
548 "TID", tid, "RC", rc);
549 return rc;
550 }
551
552 std::string description;
553 std::stringstream strStream;
554 log_level logLevel = log_level::OK;
555
556 description += "SENSOR_EVENT : SENSOR_OP_STATE: ";
557 description += prefixMsgStrCreation(tid, sensorId);
558 strStream << std::setfill('0') << std::hex << "present_op_state 0x"
559 << std::setw(2) << static_cast<uint32_t>(present_op_state)
560 << "previous_op_state 0x" << std::setw(2)
561 << static_cast<uint32_t>(previous_op_state) << std::dec;
562 description += strStream.str();
563
564 sendJournalRedfish(description, logLevel);
565
566 return PLDM_SUCCESS;
567 }
568
handleSensorEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)569 int OemEventManager::handleSensorEvent(
570 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
571 pldm_tid_t tid, size_t eventDataOffset)
572 {
573 /* This OEM event handler is only used for SoC terminus*/
574 if (!tidToSocketNameMap.contains(tid))
575 {
576 return PLDM_SUCCESS;
577 }
578 auto eventData =
579 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
580 auto eventDataSize = payloadLength - eventDataOffset;
581
582 uint16_t sensorId = 0;
583 uint8_t sensorEventClassType = 0;
584 size_t eventClassDataOffset = 0;
585 auto rc =
586 decode_sensor_event_data(eventData, eventDataSize, &sensorId,
587 &sensorEventClassType, &eventClassDataOffset);
588 if (rc)
589 {
590 lg2::error("Failed to decode sensor event data return code {RC}.", "RC",
591 rc);
592 return rc;
593 }
594 const uint8_t* sensorData = eventData + eventClassDataOffset;
595 size_t sensorDataLength = eventDataSize - eventClassDataOffset;
596
597 switch (sensorEventClassType)
598 {
599 case PLDM_NUMERIC_SENSOR_STATE:
600 {
601 return processNumericSensorEvent(tid, sensorId, sensorData,
602 sensorDataLength);
603 }
604 case PLDM_STATE_SENSOR_STATE:
605 {
606 return processStateSensorEvent(tid, sensorId, sensorData,
607 sensorDataLength);
608 }
609 case PLDM_SENSOR_OP_STATE:
610 {
611 return processSensorOpStateEvent(tid, sensorId, sensorData,
612 sensorDataLength);
613 }
614 default:
615 std::string description;
616 std::stringstream strStream;
617 log_level logLevel = log_level::OK;
618
619 description += "SENSOR_EVENT : Unsupported Sensor Class " +
620 std::to_string(sensorEventClassType) + ": ";
621 description += prefixMsgStrCreation(tid, sensorId);
622 strStream << std::setfill('0') << std::hex
623 << std::setw(sizeof(sensorData) * 2) << "Sensor data: ";
624
625 auto dataPtr = sensorData;
626 for ([[maybe_unused]] const auto& i :
627 std::views::iota(0, (int)sensorDataLength))
628 {
629 strStream << "0x" << static_cast<uint32_t>(*dataPtr);
630 dataPtr += sizeof(sensorData);
631 }
632
633 description += strStream.str();
634
635 sendJournalRedfish(description, logLevel);
636 }
637 lg2::info("Unsupported class type {CLASSTYPE}", "CLASSTYPE",
638 sensorEventClassType);
639 return PLDM_ERROR;
640 }
641
handlePCIeHotPlugEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)642 void OemEventManager::handlePCIeHotPlugEvent(pldm_tid_t tid, uint16_t sensorId,
643 uint32_t presentReading)
644 {
645 std::string description;
646 std::stringstream strStream;
647 PCIeHotPlugEventRecord_t record{presentReading};
648
649 std::string sAction = (!record.bits.action) ? "Insertion" : "Removal";
650 std::string sOpStatus = (!record.bits.opStatus) ? "Successful" : "Failed";
651 log_level logLevel =
652 (!record.bits.opStatus) ? log_level::OK : log_level::WARNING;
653
654 description += prefixMsgStrCreation(tid, sensorId);
655
656 strStream << "Segment (0x" << std::setfill('0') << std::hex << std::setw(2)
657 << static_cast<uint32_t>(record.bits.segment) << "); Bus (0x"
658 << std::setw(2) << static_cast<uint32_t>(record.bits.bus)
659 << "); Device (0x" << std::setw(2)
660 << static_cast<uint32_t>(record.bits.device) << "); Function (0x"
661 << std::setw(2) << static_cast<uint32_t>(record.bits.function)
662 << "); Action (" << sAction << "); Operation status ("
663 << sOpStatus << "); Media slot number (" << std::dec
664 << static_cast<uint32_t>(record.bits.mediaSlot) << ")";
665
666 description += strStream.str();
667
668 // Log to Redfish event
669 sendJournalRedfish(description, logLevel);
670 }
671
dimmTrainingFailureToMsg(uint32_t failureInfo)672 std::string OemEventManager::dimmTrainingFailureToMsg(uint32_t failureInfo)
673 {
674 std::string description;
675 DIMMTrainingFailure_t failure{failureInfo};
676
677 if (dimmTrainingFailureTypeMap.contains(failure.bits.type))
678 {
679 auto failureInfoMap = dimmTrainingFailureTypeMap[failure.bits.type];
680
681 description += std::get<0>(failureInfoMap);
682
683 description += "; MCU rank index " +
684 std::to_string(failure.bits.mcuRankIdx);
685
686 description += "; Slice number " +
687 std::to_string(failure.bits.sliceNum);
688
689 description += "; Upper nibble error status: ";
690 description += (!failure.bits.upperNibbStatErr)
691 ? "No error"
692 : "Found no rising edge";
693
694 description += "; Lower nibble error status: ";
695 description += (!failure.bits.lowerNibbStatErr)
696 ? "No error"
697 : "Found no rising edge";
698
699 description += "; Failure syndrome 0: ";
700
701 auto& syndromeMap = std::get<1>(failureInfoMap);
702 if (syndromeMap.contains(failure.bits.syndrome))
703 {
704 description += syndromeMap[failure.bits.syndrome];
705 }
706 else
707 {
708 description += "(Unknown syndrome)";
709 }
710 }
711 else
712 {
713 description += "Unknown training failure type " +
714 std::to_string(failure.bits.type);
715 }
716
717 return description;
718 }
719
handleDIMMStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)720 void OemEventManager::handleDIMMStatusEvent(pldm_tid_t tid, uint16_t sensorId,
721 uint32_t presentReading)
722 {
723 log_level logLevel{log_level::WARNING};
724 std::string description;
725 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
726 uint32_t byte012 = presentReading & 0xffffff;
727
728 description += prefixMsgStrCreation(tid, sensorId);
729
730 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
731 auto dimmIdx = sensorIdToDIMMIdx(sensorId);
732 if (dimmIdx >= maxDIMMIdxBitNum)
733 {
734 return;
735 }
736
737 description += "DIMM " + std::to_string(dimmIdx) + " ";
738
739 if (dimmStatusToMsgMap.contains(byte3))
740 {
741 if (byte3 == dimm_status::INSTALLED_NO_ERROR ||
742 byte3 == dimm_status::INSTALLED_BUT_DISABLED)
743 {
744 logLevel = log_level::OK;
745 }
746
747 description += dimmStatusToMsgMap[byte3];
748
749 if (byte3 == dimm_status::TRAINING_FAILURE)
750 {
751 description += "; " + dimmTrainingFailureToMsg(byte012);
752 }
753 else if (byte3 == dimm_status::PMIC_TEMP_ALERT)
754 {
755 uint8_t byte0 = (byte012 & 0xff);
756 if (byte0 < pmicTempAlertMsg.size())
757 {
758 description += ": " + pmicTempAlertMsg[byte0];
759 }
760 }
761 }
762 else
763 {
764 switch (byte3)
765 {
766 case dimm_status::PMIC_HIGH_TEMP:
767 if (byte012 == 0x01)
768 {
769 description += "has PMIC high temp condition";
770 }
771 break;
772 case dimm_status::TSx_HIGH_TEMP:
773 switch (byte012)
774 {
775 case 0x01:
776 description += "has TS0";
777 break;
778 case 0x02:
779 description += "has TS1";
780 break;
781 case 0x03:
782 description += "has TS0 and TS1";
783 break;
784 }
785 description += " exceeding their high temperature threshold";
786 break;
787 case dimm_status::SPD_HUB_HIGH_TEMP:
788 if (byte012 == 0x01)
789 {
790 description += "has SPD/HUB high temp condition";
791 }
792 break;
793 default:
794 description += "has unsupported status " +
795 std::to_string(byte3);
796 break;
797 }
798 }
799
800 // Log to Redfish event
801 sendJournalRedfish(description, logLevel);
802 }
803
handleDDRStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)804 void OemEventManager::handleDDRStatusEvent(pldm_tid_t tid, uint16_t sensorId,
805 uint32_t presentReading)
806 {
807 log_level logLevel{log_level::WARNING};
808 std::string description;
809 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
810 uint32_t byte012 = presentReading & 0xffffff;
811
812 description += prefixMsgStrCreation(tid, sensorId);
813
814 description += "DDR ";
815 if (ddrStatusToMsgMap.contains(byte3))
816 {
817 if (byte3 == ddr_status::NO_SYSTEM_LEVEL_ERROR)
818 {
819 logLevel = log_level::OK;
820 }
821
822 description += ddrStatusToMsgMap[byte3];
823
824 if (byte3 == ddr_status::CONFIGURATION_FAILURE ||
825 byte3 == ddr_status::TRAINING_FAILURE)
826 {
827 // List out failed DIMMs
828 description += dimmIdxsToString(byte012);
829 }
830 }
831 else
832 {
833 description += "has unsupported status " + std::to_string(byte3);
834 }
835
836 // Log to Redfish event
837 sendJournalRedfish(description, logLevel);
838 }
839
handleVRDStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)840 void OemEventManager::handleVRDStatusEvent(pldm_tid_t tid, uint16_t sensorId,
841 uint32_t presentReading)
842 {
843 log_level logLevel{log_level::WARNING};
844 std::string description;
845 std::stringstream strStream;
846
847 description += prefixMsgStrCreation(tid, sensorId);
848
849 VRDStatus_t status{presentReading};
850
851 if (status.bits.warning && status.bits.critical)
852 {
853 description += "A VR warning and a VR critical";
854 logLevel = log_level::CRITICAL;
855 }
856 else
857 {
858 if (status.bits.warning)
859 {
860 description += "A VR warning";
861 }
862 else if (status.bits.critical)
863 {
864 description += "A VR critical";
865 logLevel = log_level::CRITICAL;
866 }
867 else
868 {
869 description += "No VR warning or critical";
870 logLevel = log_level::OK;
871 }
872 }
873 description += " condition observed";
874
875 strStream << "; VR status byte high is 0x" << std::setfill('0') << std::hex
876 << std::setw(2)
877 << static_cast<uint32_t>(status.bits.vr_status_byte_high)
878 << "; VR status byte low is 0x" << std::setw(2)
879 << static_cast<uint32_t>(status.bits.vr_status_byte_low)
880 << "; Reading is 0x" << std::setw(2)
881 << static_cast<uint32_t>(presentReading) << ";";
882
883 description += strStream.str();
884
885 // Log to Redfish event
886 sendJournalRedfish(description, logLevel);
887 }
888
handleNumericWatchdogEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)889 void OemEventManager::handleNumericWatchdogEvent(
890 pldm_tid_t tid, uint16_t sensorId, uint32_t presentReading)
891 {
892 std::string description;
893 log_level logLevel = log_level::CRITICAL;
894
895 description += prefixMsgStrCreation(tid, sensorId);
896
897 if (presentReading & 0x01)
898 {
899 description += "Global watchdog expired;";
900 }
901 if (presentReading & 0x02)
902 {
903 description += "Secure watchdog expired;";
904 }
905 if (presentReading & 0x04)
906 {
907 description += "Non-secure watchdog expired;";
908 }
909
910 // Log to Redfish event
911 sendJournalRedfish(description, logLevel);
912 }
913
processOemMsgPollEvent(pldm_tid_t tid,uint16_t eventId,const uint8_t * eventData,size_t eventDataSize)914 int OemEventManager::processOemMsgPollEvent(pldm_tid_t tid, uint16_t eventId,
915 const uint8_t* eventData,
916 size_t eventDataSize)
917 {
918 EFI_AMPERE_ERROR_DATA ampHdr;
919
920 decodeCperRecord(eventData, eventDataSize, &Hdr);
921
922 addCperSELLog(tid, eventId, &Hdr);
923
924 /* isBert at bit 12 of TypeId */
925 if (ampHdr.TypeId & 0x0800)
926 {
927 lg2::info("Ampere SoC BERT is triggered.");
928 std::variant<std::string> value(
929 "com.ampere.CrashCapture.Trigger.TriggerAction.Bert");
930 try
931 {
932 auto& bus = pldm::utils::DBusHandler::getBus();
933 auto method =
934 bus.new_method_call("com.ampere.CrashCapture.Trigger",
935 "/com/ampere/crashcapture/trigger",
936 pldm::utils::dbusProperties, "Set");
937 method.append("com.ampere.CrashCapture.Trigger", "TriggerActions",
938 value);
939 bus.call_noreply(method);
940 }
941 catch (const std::exception& e)
942 {
943 lg2::error("call BERT trigger error - {ERROR}", "ERROR", e);
944 }
945 }
946
947 return PLDM_SUCCESS;
948 }
949
handlepldmMessagePollEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)950 int OemEventManager::handlepldmMessagePollEvent(
951 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
952 pldm_tid_t tid, size_t eventDataOffset)
953 {
954 /* This OEM event handler is only used for SoC terminus*/
955 if (!tidToSocketNameMap.contains(tid))
956 {
957 return PLDM_SUCCESS;
958 }
959
960 auto eventData =
961 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
962 auto eventDataSize = payloadLength - eventDataOffset;
963
964 pldm_message_poll_event poll_event{};
965 auto rc = decode_pldm_message_poll_event_data(eventData, eventDataSize,
966 &poll_event);
967 if (rc)
968 {
969 lg2::error("Failed to decode PldmMessagePollEvent event, error {RC} ",
970 "RC", rc);
971 return rc;
972 }
973
974 auto sensorID = poll_event.event_id;
975 /* The UE errors */
976 if (rasUESensorIDs.contains(sensorID))
977 {
978 pldm::utils::DBusMapping dbusMapping{
979 "/xyz/openbmc_project/led/groups/ras_ue_fault",
980 "xyz.openbmc_project.Led.Group", "Asserted", "bool"};
981 try
982 {
983 pldm::utils::DBusHandler().setDbusProperty(
984 dbusMapping, pldm::utils::PropertyValue{bool(true)});
985 }
986 catch (const std::exception& e)
987 {
988 lg2::error(
989 "Failed to set the RAS UE LED terminus ID {TID} sensor ID {SENSORID} - errors {ERROR}",
990 "TID", tid, "SENSORID", sensorID, "ERROR", e);
991 }
992 }
993
994 return PLDM_SUCCESS;
995 }
996
997 } // namespace oem_ampere
998 } // namespace pldm
999