1 #include "oem_event_manager.hpp"
2
3 #include "libcper/Cper.h"
4
5 #include "cper.hpp"
6 #include "requester/handler.hpp"
7 #include "requester/request.hpp"
8
9 #include <config.h>
10 #include <libpldm/pldm.h>
11 #include <libpldm/utils.h>
12 #include <systemd/sd-journal.h>
13
14 #include <phosphor-logging/lg2.hpp>
15 #include <xyz/openbmc_project/Logging/Entry/server.hpp>
16
17 #include <algorithm>
18 #include <map>
19 #include <set>
20 #include <sstream>
21 #include <string>
22 #include <unordered_map>
23
24 namespace pldm
25 {
26 namespace oem_ampere
27 {
28 namespace fs = std::filesystem;
29 using namespace std::chrono;
30
31 namespace boot_stage = boot::stage;
32 namespace ddr_status = ddr::status;
33 namespace dimm_status = dimm::status;
34 namespace dimm_syndrome = dimm::training_failure::dimm_syndrome;
35 namespace phy_syndrome = dimm::training_failure::phy_syndrome;
36 namespace training_failure = dimm::training_failure;
37
38 constexpr const char* ampereEventRegistry = "OpenBMC.0.1.AmpereEvent";
39 constexpr const char* ampereWarningRegistry = "OpenBMC.0.1.AmpereWarning";
40 constexpr const char* ampereCriticalRegistry = "OpenBMC.0.1.AmpereCritical";
41 constexpr const char* BIOSFWPanicRegistry =
42 "OpenBMC.0.1.BIOSFirmwarePanicReason";
43 constexpr auto maxDIMMIdxBitNum = 24;
44 constexpr auto maxDIMMInstantNum = 24;
45
46 const std::set<uint16_t> rasUESensorIDs = {CORE_UE, MCU_UE, PCIE_UE, SOC_UE};
47
/*
    Status suffix of a generic boot stage.
    Indexed by byte 0 of the boot code; handleBootOverallEvent() reports
    indexes outside the array as an unknown status.
*/
std::array<std::string, 3> bootStatMsg{" booting", " completed", " failed"};

/*
    Progress suffix of the DDR training stage.
    Indexed by byte 0 of the boot code.
*/
std::array<std::string, 3> ddrTrainingMsg{
    " progress started", " in-progress", " progress completed"};

/*
    PMIC temperature alert levels.
    Indexed by byte 0 of a DIMM status reading whose status is
    PMIC_TEMP_ALERT (see handleDIMMStatusEvent()).
*/
std::array<std::string, 8> pmicTempAlertMsg{
    "Below 85°C", "85°C",  "95°C",  "105°C",
    "115°C",      "125°C", "135°C", "Equal or greater than 140°C"};
67
68 /*
69 In Ampere systems, BMC only directly communicates with MCTP/PLDM SoC
70 EPs through SMBus and PCIe. When host boots up, SMBUS interface
71 comes up first. In this interface, BMC is bus owner.
72
73 mctpd will set the EID 0x14 for S0 and 0x16 for S1 (if available).
74 pldmd will always use TID 1 for S0 and TID 2 for S1 (if available).
75 */
76 EventToMsgMap_t tidToSocketNameMap = {{1, "SOCKET 0"}, {2, "SOCKET 1"}};
77
78 /*
79 A map between sensor IDs and their names in string.
80 Using pldm::oem::sensor_ids
81 */
82 EventToMsgMap_t sensorIdToStrMap = {
83 {DDR_STATUS, "DDR_STATUS"},
84 {PCP_VR_STATE, "PCP_VR_STATE"},
85 {SOC_VR_STATE, "SOC_VR_STATE"},
86 {DPHY_VR1_STATE, "DPHY_VR1_STATE"},
87 {DPHY_VR2_STATE, "DPHY_VR2_STATE"},
88 {D2D_VR_STATE, "D2D_VR_STATE"},
89 {IOC_VR1_STATE, "IOC_VR1_STATE"},
90 {IOC_VR2_STATE, "IOC_VR2_STATE"},
91 {PCI_D_VR_STATE, "PCI_D_VR_STATE"},
92 {PCI_A_VR_STATE, "PCI_A_VR_STATE"},
93 {PCIE_HOT_PLUG, "PCIE_HOT_PLUG"},
94 {BOOT_OVERALL, "BOOT_OVERALL"},
95 {SOC_HEALTH_AVAILABILITY, "SOC_HEALTH_AVAILABILITY"},
96 {WATCH_DOG, "WATCH_DOG"}};
97
98 /*
99 A map between the boot stages and logging strings.
100 Using pldm::oem::boot::stage::boot_stage
101 */
102 EventToMsgMap_t bootStageToMsgMap = {
103 {boot_stage::SECPRO, "SECpro"},
104 {boot_stage::MPRO, "Mpro"},
105 {boot_stage::ATF_BL1, "ATF BL1"},
106 {boot_stage::ATF_BL2, "ATF BL2"},
107 {boot_stage::DDR_INITIALIZATION, "DDR initialization"},
108 {boot_stage::DDR_TRAINING, "DDR training"},
109 {boot_stage::S0_DDR_TRAINING_FAILURE, "DDR training failure"},
110 {boot_stage::ATF_BL31, "ATF BL31"},
111 {boot_stage::ATF_BL32, "ATF BL32"},
112 {boot_stage::S1_DDR_TRAINING_FAILURE, "DDR training failure"},
113 {boot_stage::UEFI_STATUS_CLASS_CODE_MIN,
114 "ATF BL33 (UEFI) booting status = "}};
115
116 /*
117 A map between DDR status and logging strings.
118 Using pldm::oem::ddr::status::ddr_status
119 */
120 EventToMsgMap_t ddrStatusToMsgMap = {
121 {ddr_status::NO_SYSTEM_LEVEL_ERROR, "has no system level error"},
122 {ddr_status::ECC_INITIALIZATION_FAILURE, "has ECC initialization failure"},
123 {ddr_status::CONFIGURATION_FAILURE, "has configuration failure at DIMMs:"},
124 {ddr_status::TRAINING_FAILURE, "has training failure at DIMMs:"},
125 {ddr_status::OTHER_FAILURE, "has other failure"},
126 {ddr_status::BOOT_FAILURE_NO_VALID_CONFIG,
127 "has boot failure due to no configuration"},
128 {ddr_status::FAILSAFE_ACTIVATED_NEXT_BOOT_SUCCESS,
129 "failsafe activated but boot success with the next valid configuration"}};
130
131 /*
132 A map between DIMM status and logging strings.
133 Using pldm::oem::dimm::status::dimm_status
134 */
135 EventToMsgMap_t dimmStatusToMsgMap = {
136 {dimm_status::INSTALLED_NO_ERROR, "is installed and no error"},
137 {dimm_status::NOT_INSTALLED, "is not installed"},
138 {dimm_status::OTHER_FAILURE, "has other failure"},
139 {dimm_status::INSTALLED_BUT_DISABLED, "is installed but disabled"},
140 {dimm_status::TRAINING_FAILURE, "has training failure; "},
141 {dimm_status::PMIC_TEMP_ALERT, "has PMIC temperature alert"}};
142
143 /*
144 A map between PHY training failure syndrome and logging strings.
145 Using
146 pldm::oem::dimm::training_faillure::phy_syndrome::phy_training_failure_syndrome
147 */
148 EventToMsgMap_t phyTrainingFailureSyndromeToMsgMap = {
149 {phy_syndrome::NA, "(N/A)"},
150 {phy_syndrome::PHY_TRAINING_SETUP_FAILURE, "(PHY training setup failure)"},
151 {phy_syndrome::CA_LEVELING, "(CA leveling)"},
152 {phy_syndrome::PHY_WRITE_LEVEL_FAILURE,
153 "(PHY write level failure - see syndrome 1)"},
154 {phy_syndrome::PHY_READ_GATE_LEVELING_FAILURE,
155 "(PHY read gate leveling failure)"},
156 {phy_syndrome::PHY_READ_LEVEL_FAILURE, "(PHY read level failure)"},
157 {phy_syndrome::WRITE_DQ_LEVELING, "(Write DQ leveling)"},
158 {phy_syndrome::PHY_SW_TRAINING_FAILURE, "(PHY SW training failure)"}};
159
160 /*
161 A map between DIMM training failure syndrome and logging strings.
162 Using
163 pldm::oem::dimm::training_faillure::dimm_syndrome::dimm_training_failure_syndrome
164 */
165 EventToMsgMap_t dimmTrainingFailureSyndromeToMsgMap = {
166 {dimm_syndrome::NA, "(N/A)"},
167 {dimm_syndrome::DRAM_VREFDQ_TRAINING_FAILURE,
168 "(DRAM VREFDQ training failure)"},
169 {dimm_syndrome::LRDIMM_DB_TRAINING_FAILURE, "(LRDIMM DB training failure)"},
170 {dimm_syndrome::LRDRIMM_DB_SW_TRAINING_FAILURE,
171 "(LRDRIMM DB SW training failure)"}};
172
173 /*
174 A map between DIMM training failure type and a pair of <logging strings -
175 syndrome map>. Using
176 pldm::oem::dimm::training_faillure::dimm_training_failure_type
177 */
178 std::unordered_map<uint8_t, std::pair<std::string, EventToMsgMap_t>>
179 dimmTrainingFailureTypeMap = {
180 {training_failure::PHY_TRAINING_FAILURE_TYPE,
181 std::make_pair("PHY training failure",
182 phyTrainingFailureSyndromeToMsgMap)},
183 {training_failure::DIMM_TRAINING_FAILURE_TYPE,
184 std::make_pair("DIMM training failure",
185 dimmTrainingFailureSyndromeToMsgMap)}};
186
187 /*
188 A map between log level and the registry used for Redfish SEL log
189 Using pldm::oem::log_level
190 */
191 std::unordered_map<log_level, std::string> logLevelToRedfishMsgIdMap = {
192 {log_level::OK, ampereEventRegistry},
193 {log_level::WARNING, ampereWarningRegistry},
194 {log_level::CRITICAL, ampereCriticalRegistry},
195 {log_level::BIOSFWPANIC, BIOSFWPanicRegistry}};
196
197 std::unordered_map<
198 uint16_t,
199 std::vector<std::pair<
200 std::string,
201 std::unordered_map<uint8_t, std::pair<log_level, std::string>>>>>
202 stateSensorToMsgMap = {
203 {SOC_HEALTH_AVAILABILITY,
204 {{"SoC Health",
205 {{1, {log_level::OK, "Normal"}},
206 {2, {log_level::WARNING, "Non-Critical"}},
207 {3, {log_level::CRITICAL, "Critical"}},
208 {4, {log_level::CRITICAL, "Fatal"}}}},
209 {"SoC Availability",
210 {{1, {log_level::OK, "Enabled"}},
211 {2, {log_level::WARNING, "Disabled"}},
212 {3, {log_level::CRITICAL, "Shutdown"}}}}}},
213 {WATCH_DOG,
214 {{"Global Watch Dog",
215 {{1, {log_level::OK, "Normal"}},
216 {2, {log_level::CRITICAL, "Timer Expired"}}}},
217 {"Secure Watch Dog",
218 {{1, {log_level::OK, "Normal"}},
219 {2, {log_level::CRITICAL, "Timer Expired"}}}},
220 {"Non-secure Watch Dog",
221 {{1, {log_level::OK, "Normal"}},
222 {2, {log_level::CRITICAL, "Timer Expired"}}}}}}};
223
prefixMsgStrCreation(pldm_tid_t tid,uint16_t sensorId)224 std::string OemEventManager::prefixMsgStrCreation(pldm_tid_t tid,
225 uint16_t sensorId)
226 {
227 std::string description;
228 if (!tidToSocketNameMap.contains(tid))
229 {
230 description += "TID " + std::to_string(tid) + ": ";
231 }
232 else
233 {
234 description += tidToSocketNameMap[tid] + ": ";
235 }
236
237 if (!sensorIdToStrMap.contains(sensorId))
238 {
239 description += "Sensor ID " + std::to_string(sensorId) + ": ";
240 }
241 else
242 {
243 description += sensorIdToStrMap[sensorId] + ": ";
244 }
245
246 return description;
247 }
248
sendJournalRedfish(const std::string & description,log_level & logLevel)249 void OemEventManager::sendJournalRedfish(const std::string& description,
250 log_level& logLevel)
251 {
252 if (description.empty())
253 {
254 return;
255 }
256
257 if (!logLevelToRedfishMsgIdMap.contains(logLevel))
258 {
259 lg2::error("Invalid {LEVEL} Description {DES}", "LEVEL", logLevel,
260 "DES", description);
261 return;
262 }
263 auto redfishMsgId = logLevelToRedfishMsgIdMap[logLevel];
264 lg2::info("MESSAGE={DES}", "DES", description, "REDFISH_MESSAGE_ID",
265 redfishMsgId, "REDFISH_MESSAGE_ARGS", description);
266 }
267
dimmIdxsToString(uint32_t dimmIdxs)268 std::string OemEventManager::dimmIdxsToString(uint32_t dimmIdxs)
269 {
270 std::string description;
271 for (const auto bitIdx : std::views::iota(0, maxDIMMIdxBitNum))
272 {
273 if (dimmIdxs & (static_cast<uint32_t>(1) << bitIdx))
274 {
275 description += " #" + std::to_string(bitIdx);
276 }
277 }
278 return description;
279 }
280
sensorIdToDIMMIdx(const uint16_t & sensorId)281 uint8_t OemEventManager::sensorIdToDIMMIdx(const uint16_t& sensorId)
282 {
283 uint8_t dimmIdx = maxDIMMInstantNum;
284 int sensorId_Off = sensorId - 4;
285 if ((sensorId_Off >= 0) && ((sensorId_Off % 2) == 0) &&
286 ((sensorId_Off / 2) < maxDIMMInstantNum))
287 {
288 dimmIdx = sensorId_Off / 2;
289 }
290 return dimmIdx;
291 }
292
handleBootOverallEvent(pldm_tid_t,uint16_t,uint32_t presentReading)293 void OemEventManager::handleBootOverallEvent(
294 pldm_tid_t /*tid*/, uint16_t /*sensorId*/, uint32_t presentReading)
295 {
296 log_level logLevel{log_level::OK};
297 std::string description;
298 std::stringstream strStream;
299
300 uint8_t byte0 = (presentReading & 0x000000ff);
301 uint8_t byte1 = (presentReading & 0x0000ff00) >> 8;
302 uint8_t byte2 = (presentReading & 0x00ff0000) >> 16;
303 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
304 /*
305 * Handle SECpro, Mpro, ATF BL1, ATF BL2, ATF BL31,
306 * ATF BL32 and DDR initialization
307 */
308 if (bootStageToMsgMap.contains(byte3))
309 {
310 // Boot stage adding
311 description += bootStageToMsgMap[byte3];
312
313 switch (byte3)
314 {
315 case boot_stage::DDR_TRAINING:
316 if (byte0 >= ddrTrainingMsg.size())
317 {
318 logLevel = log_level::BIOSFWPANIC;
319 description += " unknown status";
320 }
321 else
322 {
323 description += ddrTrainingMsg[byte0];
324 }
325 if (0x01 == byte0)
326 {
327 // Add complete percentage
328 description += " at " + std::to_string(byte1) + "%";
329 }
330 break;
331 case boot_stage::S0_DDR_TRAINING_FAILURE:
332 case boot_stage::S1_DDR_TRAINING_FAILURE:
333 // ddr_training_status_msg()
334 logLevel = log_level::BIOSFWPANIC;
335 description += " at DIMMs:";
336 // dimmIdxs = presentReading & 0x00ffffff;
337 description += dimmIdxsToString(presentReading & 0x00ffffff);
338 description += " of socket ";
339 description +=
340 (boot_stage::S0_DDR_TRAINING_FAILURE == byte3) ? "0" : "1";
341 break;
342 default:
343 if (byte0 >= bootStatMsg.size())
344 {
345 logLevel = log_level::BIOSFWPANIC;
346 description += " unknown status";
347 }
348 else
349 {
350 description += bootStatMsg[byte0];
351 }
352 break;
353 }
354
355 // Sensor report action is fail
356 if (boot::status::BOOT_STATUS_FAILURE == byte2)
357 {
358 logLevel = log_level::BIOSFWPANIC;
359 }
360 }
361 else
362 {
363 if (byte3 <= boot_stage::UEFI_STATUS_CLASS_CODE_MAX)
364 {
365 description +=
366 bootStageToMsgMap[boot_stage::UEFI_STATUS_CLASS_CODE_MIN];
367
368 strStream
369 << "Segment (0x" << std::setfill('0') << std::hex
370 << std::setw(8) << static_cast<uint32_t>(presentReading)
371 << "); Status Class (0x" << std::setw(2)
372 << static_cast<uint32_t>(byte3) << "); Status SubClass (0x"
373 << std::setw(2) << static_cast<uint32_t>(byte2)
374 << "); Operation Code (0x" << std::setw(4)
375 << static_cast<uint32_t>((presentReading & 0xffff0000) >> 16)
376 << ")" << std::dec;
377
378 description += strStream.str();
379 }
380 }
381
382 // Log to Redfish event
383 sendJournalRedfish(description, logLevel);
384 }
385
processNumericSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)386 int OemEventManager::processNumericSensorEvent(
387 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
388 size_t sensorDataLength)
389 {
390 uint8_t eventState = 0;
391 uint8_t previousEventState = 0;
392 uint8_t sensorDataSize = 0;
393 uint32_t presentReading;
394 auto rc = decode_numeric_sensor_data(
395 sensorData, sensorDataLength, &eventState, &previousEventState,
396 &sensorDataSize, &presentReading);
397 if (rc)
398 {
399 lg2::error(
400 "Failed to decode numericSensorState event for terminus ID {TID}, error {RC} ",
401 "TID", tid, "RC", rc);
402 return rc;
403 }
404
405 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
406 if (auto dimmIdx = sensorIdToDIMMIdx(sensorId); dimmIdx < maxDIMMInstantNum)
407 {
408 handleDIMMStatusEvent(tid, sensorId, presentReading);
409 return PLDM_SUCCESS;
410 }
411
412 switch (sensorId)
413 {
414 case BOOT_OVERALL:
415 handleBootOverallEvent(tid, sensorId, presentReading);
416 break;
417 case PCIE_HOT_PLUG:
418 handlePCIeHotPlugEvent(tid, sensorId, presentReading);
419 break;
420 case DDR_STATUS:
421 handleDDRStatusEvent(tid, sensorId, presentReading);
422 break;
423 case PCP_VR_STATE:
424 case SOC_VR_STATE:
425 case DPHY_VR1_STATE:
426 case DPHY_VR2_STATE:
427 case D2D_VR_STATE:
428 case IOC_VR1_STATE:
429 case IOC_VR2_STATE:
430 case PCI_D_VR_STATE:
431 case PCI_A_VR_STATE:
432 handleVRDStatusEvent(tid, sensorId, presentReading);
433 break;
434 case WATCH_DOG:
435 handleNumericWatchdogEvent(tid, sensorId, presentReading);
436 break;
437 default:
438 std::string description;
439 std::stringstream strStream;
440
441 description += "SENSOR_EVENT : NUMERIC_SENSOR_STATE: ";
442 description += prefixMsgStrCreation(tid, sensorId);
443 strStream << std::setfill('0') << std::hex << "eventState 0x"
444 << std::setw(2) << static_cast<uint32_t>(eventState)
445 << " previousEventState 0x" << std::setw(2)
446 << static_cast<uint32_t>(previousEventState)
447 << " sensorDataSize 0x" << std::setw(2)
448 << static_cast<uint32_t>(sensorDataSize)
449 << " presentReading 0x" << std::setw(8)
450 << static_cast<uint32_t>(presentReading) << std::dec;
451 description += strStream.str();
452 std::cout << description << "\n";
453 }
454 return PLDM_SUCCESS;
455 }
456
processStateSensorEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)457 int OemEventManager::processStateSensorEvent(pldm_tid_t tid, uint16_t sensorId,
458 const uint8_t* sensorData,
459 size_t sensorDataLength)
460 {
461 uint8_t sensorOffset = 0;
462 uint8_t eventState = 0;
463 uint8_t previousEventState = 0;
464
465 auto rc =
466 decode_state_sensor_data(sensorData, sensorDataLength, &sensorOffset,
467 &eventState, &previousEventState);
468 if (rc)
469 {
470 lg2::error(
471 "Failed to decode stateSensorState event for terminus ID {TID}, error {RC}",
472 "TID", tid, "RC", rc);
473 return rc;
474 }
475
476 std::string description;
477
478 if (stateSensorToMsgMap.contains(sensorId))
479 {
480 log_level logLevel = log_level::OK;
481
482 description += prefixMsgStrCreation(tid, sensorId);
483 auto componentMap = stateSensorToMsgMap[sensorId];
484 if (sensorOffset < componentMap.size())
485 {
486 description += std::get<0>(componentMap[sensorOffset]);
487 auto stateMap = std::get<1>(componentMap[sensorOffset]);
488 if (stateMap.contains(eventState))
489 {
490 logLevel = std::get<0>(stateMap[eventState]);
491 description += " state : " + std::get<1>(stateMap[eventState]);
492 if (stateMap.contains(previousEventState))
493 {
494 description += "; previous state: " +
495 std::get<1>(stateMap[previousEventState]);
496 }
497 }
498 else
499 {
500 description += " sends unsupported event state: " +
501 std::to_string(eventState);
502 if (stateMap.contains(previousEventState))
503 {
504 description += "; previous state: " +
505 std::get<1>(stateMap[previousEventState]);
506 }
507 }
508 }
509 else
510 {
511 description += "sends unsupported component sensor offset " +
512 std::to_string(sensorOffset);
513 }
514
515 sendJournalRedfish(description, logLevel);
516 }
517 else
518 {
519 std::stringstream strStream;
520 description += "SENSOR_EVENT : STATE_SENSOR_STATE: ";
521 description += prefixMsgStrCreation(tid, sensorId);
522 strStream << std::setfill('0') << std::hex << "sensorOffset 0x"
523 << std::setw(2) << static_cast<uint32_t>(sensorOffset)
524 << "eventState 0x" << std::setw(2)
525 << static_cast<uint32_t>(eventState)
526 << " previousEventState 0x" << std::setw(2)
527 << static_cast<uint32_t>(previousEventState) << std::dec;
528 description += strStream.str();
529 std::cout << description << "\n";
530 }
531
532 return PLDM_SUCCESS;
533 }
534
processSensorOpStateEvent(pldm_tid_t tid,uint16_t sensorId,const uint8_t * sensorData,size_t sensorDataLength)535 int OemEventManager::processSensorOpStateEvent(
536 pldm_tid_t tid, uint16_t sensorId, const uint8_t* sensorData,
537 size_t sensorDataLength)
538 {
539 uint8_t present_op_state = 0;
540 uint8_t previous_op_state = 0;
541
542 auto rc = decode_sensor_op_data(sensorData, sensorDataLength,
543 &present_op_state, &previous_op_state);
544 if (rc)
545 {
546 lg2::error(
547 "Failed to decode sensorOpState event for terminus ID {TID}, error {RC}",
548 "TID", tid, "RC", rc);
549 return rc;
550 }
551
552 std::string description;
553 std::stringstream strStream;
554
555 description += "SENSOR_EVENT : SENSOR_OP_STATE: ";
556 description += prefixMsgStrCreation(tid, sensorId);
557 strStream << std::setfill('0') << std::hex << "present_op_state 0x"
558 << std::setw(2) << static_cast<uint32_t>(present_op_state)
559 << "previous_op_state 0x" << std::setw(2)
560 << static_cast<uint32_t>(previous_op_state) << std::dec;
561 description += strStream.str();
562 std::cout << description << "\n";
563
564 return PLDM_SUCCESS;
565 }
566
handleSensorEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)567 int OemEventManager::handleSensorEvent(
568 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
569 pldm_tid_t tid, size_t eventDataOffset)
570 {
571 /* This OEM event handler is only used for SoC terminus*/
572 if (!tidToSocketNameMap.contains(tid))
573 {
574 return PLDM_SUCCESS;
575 }
576 auto eventData =
577 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
578 auto eventDataSize = payloadLength - eventDataOffset;
579
580 uint16_t sensorId = 0;
581 uint8_t sensorEventClassType = 0;
582 size_t eventClassDataOffset = 0;
583 auto rc =
584 decode_sensor_event_data(eventData, eventDataSize, &sensorId,
585 &sensorEventClassType, &eventClassDataOffset);
586 if (rc)
587 {
588 lg2::error("Failed to decode sensor event data return code {RC}.", "RC",
589 rc);
590 return rc;
591 }
592 const uint8_t* sensorData = eventData + eventClassDataOffset;
593 size_t sensorDataLength = eventDataSize - eventClassDataOffset;
594
595 switch (sensorEventClassType)
596 {
597 case PLDM_NUMERIC_SENSOR_STATE:
598 {
599 return processNumericSensorEvent(tid, sensorId, sensorData,
600 sensorDataLength);
601 }
602 case PLDM_STATE_SENSOR_STATE:
603 {
604 return processStateSensorEvent(tid, sensorId, sensorData,
605 sensorDataLength);
606 }
607 case PLDM_SENSOR_OP_STATE:
608 {
609 return processSensorOpStateEvent(tid, sensorId, sensorData,
610 sensorDataLength);
611 }
612 default:
613 std::string description;
614 std::stringstream strStream;
615
616 description += "SENSOR_EVENT : Unsupported Sensor Class " +
617 std::to_string(sensorEventClassType) + ": ";
618 description += prefixMsgStrCreation(tid, sensorId);
619 strStream << std::setfill('0') << std::hex
620 << std::setw(sizeof(sensorData) * 2) << "Sensor data: ";
621
622 auto dataPtr = sensorData;
623 for ([[maybe_unused]] const auto& i :
624 std::views::iota(0, (int)sensorDataLength))
625 {
626 strStream << "0x" << static_cast<uint32_t>(*dataPtr);
627 dataPtr += sizeof(sensorData);
628 }
629
630 description += strStream.str();
631 std::cout << description << "\n";
632 }
633
634 return PLDM_ERROR;
635 }
636
handlePCIeHotPlugEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)637 void OemEventManager::handlePCIeHotPlugEvent(pldm_tid_t tid, uint16_t sensorId,
638 uint32_t presentReading)
639 {
640 std::string description;
641 std::stringstream strStream;
642 PCIeHotPlugEventRecord_t record{presentReading};
643
644 std::string sAction = (!record.bits.action) ? "Insertion" : "Removal";
645 std::string sOpStatus = (!record.bits.opStatus) ? "Successful" : "Failed";
646 log_level logLevel =
647 (!record.bits.opStatus) ? log_level::OK : log_level::WARNING;
648
649 description += prefixMsgStrCreation(tid, sensorId);
650
651 strStream << "Segment (0x" << std::setfill('0') << std::hex << std::setw(2)
652 << static_cast<uint32_t>(record.bits.segment) << "); Bus (0x"
653 << std::setw(2) << static_cast<uint32_t>(record.bits.bus)
654 << "); Device (0x" << std::setw(2)
655 << static_cast<uint32_t>(record.bits.device) << "); Function (0x"
656 << std::setw(2) << static_cast<uint32_t>(record.bits.function)
657 << "); Action (" << sAction << "); Operation status ("
658 << sOpStatus << "); Media slot number (" << std::dec
659 << static_cast<uint32_t>(record.bits.mediaSlot) << ")";
660
661 description += strStream.str();
662
663 // Log to Redfish event
664 sendJournalRedfish(description, logLevel);
665 }
666
dimmTrainingFailureToMsg(uint32_t failureInfo)667 std::string OemEventManager::dimmTrainingFailureToMsg(uint32_t failureInfo)
668 {
669 std::string description;
670 DIMMTrainingFailure_t failure{failureInfo};
671
672 if (dimmTrainingFailureTypeMap.contains(failure.bits.type))
673 {
674 auto failureInfoMap = dimmTrainingFailureTypeMap[failure.bits.type];
675
676 description += std::get<0>(failureInfoMap);
677
678 description += "; MCU rank index " +
679 std::to_string(failure.bits.mcuRankIdx);
680
681 description += "; Slice number " +
682 std::to_string(failure.bits.sliceNum);
683
684 description += "; Upper nibble error status: ";
685 description += (!failure.bits.upperNibbStatErr)
686 ? "No error"
687 : "Found no rising edge";
688
689 description += "; Lower nibble error status: ";
690 description += (!failure.bits.lowerNibbStatErr)
691 ? "No error"
692 : "Found no rising edge";
693
694 description += "; Failure syndrome 0: ";
695
696 auto& syndromeMap = std::get<1>(failureInfoMap);
697 if (syndromeMap.contains(failure.bits.syndrome))
698 {
699 description += syndromeMap[failure.bits.syndrome];
700 }
701 else
702 {
703 description += "(Unknown syndrome)";
704 }
705 }
706 else
707 {
708 description += "Unknown training failure type " +
709 std::to_string(failure.bits.type);
710 }
711
712 return description;
713 }
714
handleDIMMStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)715 void OemEventManager::handleDIMMStatusEvent(pldm_tid_t tid, uint16_t sensorId,
716 uint32_t presentReading)
717 {
718 log_level logLevel{log_level::WARNING};
719 std::string description;
720 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
721 uint32_t byte012 = presentReading & 0xffffff;
722
723 description += prefixMsgStrCreation(tid, sensorId);
724
725 // DIMMx_Status sensorID 4+2*index (index 0 -> maxDIMMInstantNum-1)
726 auto dimmIdx = sensorIdToDIMMIdx(sensorId);
727 if (dimmIdx >= maxDIMMIdxBitNum)
728 {
729 return;
730 }
731
732 description += "DIMM " + std::to_string(dimmIdx) + " ";
733
734 if (dimmStatusToMsgMap.contains(byte3))
735 {
736 if (byte3 == dimm_status::INSTALLED_NO_ERROR ||
737 byte3 == dimm_status::INSTALLED_BUT_DISABLED)
738 {
739 logLevel = log_level::OK;
740 }
741
742 description += dimmStatusToMsgMap[byte3];
743
744 if (byte3 == dimm_status::TRAINING_FAILURE)
745 {
746 description += "; " + dimmTrainingFailureToMsg(byte012);
747 }
748 else if (byte3 == dimm_status::PMIC_TEMP_ALERT)
749 {
750 uint8_t byte0 = (byte012 & 0xff);
751 if (byte0 < pmicTempAlertMsg.size())
752 {
753 description += ": " + pmicTempAlertMsg[byte0];
754 }
755 }
756 }
757 else
758 {
759 switch (byte3)
760 {
761 case dimm_status::PMIC_HIGH_TEMP:
762 if (byte012 == 0x01)
763 {
764 description += "has PMIC high temp condition";
765 }
766 break;
767 case dimm_status::TSx_HIGH_TEMP:
768 switch (byte012)
769 {
770 case 0x01:
771 description += "has TS0";
772 break;
773 case 0x02:
774 description += "has TS1";
775 break;
776 case 0x03:
777 description += "has TS0 and TS1";
778 break;
779 }
780 description += " exceeding their high temperature threshold";
781 break;
782 case dimm_status::SPD_HUB_HIGH_TEMP:
783 if (byte012 == 0x01)
784 {
785 description += "has SPD/HUB high temp condition";
786 }
787 break;
788 default:
789 description += "has unsupported status " +
790 std::to_string(byte3);
791 break;
792 }
793 }
794
795 // Log to Redfish event
796 sendJournalRedfish(description, logLevel);
797 }
798
handleDDRStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)799 void OemEventManager::handleDDRStatusEvent(pldm_tid_t tid, uint16_t sensorId,
800 uint32_t presentReading)
801 {
802 log_level logLevel{log_level::WARNING};
803 std::string description;
804 uint8_t byte3 = (presentReading & 0xff000000) >> 24;
805 uint32_t byte012 = presentReading & 0xffffff;
806
807 description += prefixMsgStrCreation(tid, sensorId);
808
809 description += "DDR ";
810 if (ddrStatusToMsgMap.contains(byte3))
811 {
812 if (byte3 == ddr_status::NO_SYSTEM_LEVEL_ERROR)
813 {
814 logLevel = log_level::OK;
815 }
816
817 description += ddrStatusToMsgMap[byte3];
818
819 if (byte3 == ddr_status::CONFIGURATION_FAILURE ||
820 byte3 == ddr_status::TRAINING_FAILURE)
821 {
822 // List out failed DIMMs
823 description += dimmIdxsToString(byte012);
824 }
825 }
826 else
827 {
828 description += "has unsupported status " + std::to_string(byte3);
829 }
830
831 // Log to Redfish event
832 sendJournalRedfish(description, logLevel);
833 }
834
handleVRDStatusEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)835 void OemEventManager::handleVRDStatusEvent(pldm_tid_t tid, uint16_t sensorId,
836 uint32_t presentReading)
837 {
838 log_level logLevel{log_level::WARNING};
839 std::string description;
840 std::stringstream strStream;
841
842 description += prefixMsgStrCreation(tid, sensorId);
843
844 VRDStatus_t status{presentReading};
845
846 if (status.bits.warning && status.bits.critical)
847 {
848 description += "A VR warning and a VR critical";
849 logLevel = log_level::CRITICAL;
850 }
851 else
852 {
853 if (status.bits.warning)
854 {
855 description += "A VR warning";
856 }
857 else if (status.bits.critical)
858 {
859 description += "A VR critical";
860 logLevel = log_level::CRITICAL;
861 }
862 else
863 {
864 description += "No VR warning or critical";
865 logLevel = log_level::OK;
866 }
867 }
868 description += " condition observed";
869
870 strStream << "; VR status byte high is 0x" << std::setfill('0') << std::hex
871 << std::setw(2)
872 << static_cast<uint32_t>(status.bits.vr_status_byte_high)
873 << "; VR status byte low is 0x" << std::setw(2)
874 << static_cast<uint32_t>(status.bits.vr_status_byte_low)
875 << "; Reading is 0x" << std::setw(2)
876 << static_cast<uint32_t>(presentReading) << ";";
877
878 description += strStream.str();
879
880 // Log to Redfish event
881 sendJournalRedfish(description, logLevel);
882 }
883
handleNumericWatchdogEvent(pldm_tid_t tid,uint16_t sensorId,uint32_t presentReading)884 void OemEventManager::handleNumericWatchdogEvent(
885 pldm_tid_t tid, uint16_t sensorId, uint32_t presentReading)
886 {
887 std::string description;
888 log_level logLevel = log_level::CRITICAL;
889
890 description += prefixMsgStrCreation(tid, sensorId);
891
892 if (presentReading & 0x01)
893 {
894 description += "Global watchdog expired;";
895 }
896 if (presentReading & 0x02)
897 {
898 description += "Secure watchdog expired;";
899 }
900 if (presentReading & 0x04)
901 {
902 description += "Non-secure watchdog expired;";
903 }
904
905 // Log to Redfish event
906 sendJournalRedfish(description, logLevel);
907 }
908
processOemMsgPollEvent(pldm_tid_t tid,uint16_t eventId,const uint8_t * eventData,size_t eventDataSize)909 int OemEventManager::processOemMsgPollEvent(pldm_tid_t tid, uint16_t eventId,
910 const uint8_t* eventData,
911 size_t eventDataSize)
912 {
913 EFI_AMPERE_ERROR_DATA ampHdr;
914
915 decodeCperRecord(eventData, eventDataSize, &Hdr);
916
917 addCperSELLog(tid, eventId, &Hdr);
918
919 /* isBert at bit 12 of TypeId */
920 if (ampHdr.TypeId & 0x0800)
921 {
922 lg2::info("Ampere SoC BERT is triggered.");
923 std::variant<std::string> value(
924 "com.ampere.CrashCapture.Trigger.TriggerAction.Bert");
925 try
926 {
927 auto& bus = pldm::utils::DBusHandler::getBus();
928 auto method =
929 bus.new_method_call("com.ampere.CrashCapture.Trigger",
930 "/com/ampere/crashcapture/trigger",
931 pldm::utils::dbusProperties, "Set");
932 method.append("com.ampere.CrashCapture.Trigger", "TriggerActions",
933 value);
934 bus.call_noreply(method);
935 }
936 catch (const std::exception& e)
937 {
938 lg2::error("call BERT trigger error - {ERROR}", "ERROR", e);
939 }
940 }
941
942 return PLDM_SUCCESS;
943 }
944
handlepldmMessagePollEvent(const pldm_msg * request,size_t payloadLength,uint8_t,pldm_tid_t tid,size_t eventDataOffset)945 int OemEventManager::handlepldmMessagePollEvent(
946 const pldm_msg* request, size_t payloadLength, uint8_t /* formatVersion */,
947 pldm_tid_t tid, size_t eventDataOffset)
948 {
949 /* This OEM event handler is only used for SoC terminus*/
950 if (!tidToSocketNameMap.contains(tid))
951 {
952 return PLDM_SUCCESS;
953 }
954
955 auto eventData =
956 reinterpret_cast<const uint8_t*>(request->payload) + eventDataOffset;
957 auto eventDataSize = payloadLength - eventDataOffset;
958
959 pldm_message_poll_event poll_event{};
960 auto rc = decode_pldm_message_poll_event_data(eventData, eventDataSize,
961 &poll_event);
962 if (rc)
963 {
964 lg2::error("Failed to decode PldmMessagePollEvent event, error {RC} ",
965 "RC", rc);
966 return rc;
967 }
968
969 auto sensorID = poll_event.event_id;
970 /* The UE errors */
971 if (rasUESensorIDs.contains(sensorID))
972 {
973 pldm::utils::DBusMapping dbusMapping{
974 "/xyz/openbmc_project/led/groups/ras_ue_fault",
975 "xyz.openbmc_project.Led.Group", "Asserted", "bool"};
976 try
977 {
978 pldm::utils::DBusHandler().setDbusProperty(
979 dbusMapping, pldm::utils::PropertyValue{bool(true)});
980 }
981 catch (const std::exception& e)
982 {
983 lg2::error(
984 "Failed to set the RAS UE LED terminus ID {TID} sensor ID {SENSORID} - errors {ERROR}",
985 "TID", tid, "SENSORID", sensorID, "ERROR", e);
986 }
987 }
988
989 return PLDM_SUCCESS;
990 }
991
oemPollForPlatformEvent(pldm_tid_t tid)992 exec::task<int> OemEventManager::oemPollForPlatformEvent(pldm_tid_t tid)
993 {
994 uint64_t t0 = 0;
995
996 /* This OEM event handler is only used for SoC terminus */
997 if (!tidToSocketNameMap.contains(tid))
998 {
999 co_return PLDM_SUCCESS;
1000 }
1001
1002 if (!timeStampMap.contains(tid))
1003 {
1004 sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1005 timeStampMap.emplace(std::make_pair(tid, t0));
1006 }
1007 else
1008 {
1009 sd_event_now(event.get(), CLOCK_MONOTONIC, &t0);
1010 uint64_t elapsed = t0 - timeStampMap[tid];
1011 if (elapsed >= NORMAL_EVENT_POLLING_TIME)
1012 {
1013 co_await manager->pollForPlatformEvent(tid, 0, 0);
1014 timeStampMap[tid] = t0;
1015 }
1016 }
1017
1018 co_return PLDM_SUCCESS;
1019 }
1020 } // namespace oem_ampere
1021 } // namespace pldm
1022