1 extern "C"
2 {
3 #include <libpdbg.h>
4 #include <libpdbg_sbe.h>
5 }
6
7 #include "create_pel.hpp"
8 #include "sbe_consts.hpp"
9 #include "sbe_dump_collector.hpp"
10 #include "sbe_type.hpp"
11
12 #include <ekb/hwpf/fapi2/include/target_types.H>
13 #include <libphal.H>
14 #include <phal_exception.H>
15
16 #include <phosphor-logging/elog-errors.hpp>
17 #include <phosphor-logging/lg2.hpp>
18 #include <phosphor-logging/log.hpp>
19 #include <sbe_consts.hpp>
20 #include <xyz/openbmc_project/Common/File/error.hpp>
21 #include <xyz/openbmc_project/Common/error.hpp>
22
#include <cstdint>
#include <filesystem>
#include <format>
#include <fstream>
#include <map>
#include <mutex>
#include <stdexcept>
29
30 namespace openpower::dump::sbe_chipop
31 {
32
33 using namespace phosphor::logging;
34 using namespace openpower::dump::SBE;
35 using namespace openpower::phal::dump;
36 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
37
collectDump(uint8_t type,uint32_t id,uint32_t failingUnit,const std::filesystem::path & path)38 void SbeDumpCollector::collectDump(uint8_t type, uint32_t id,
39 uint32_t failingUnit,
40 const std::filesystem::path& path)
41 {
42 if ((type == SBE_DUMP_TYPE_SBE) || (type == SBE_DUMP_TYPE_MSBE))
43 {
44 collectSBEDump(id, failingUnit, path, type);
45 return;
46 }
47 collectHWHBDump(type, id, failingUnit, path);
48 }
49
collectHWHBDump(uint8_t type,uint32_t id,uint64_t failingUnit,const std::filesystem::path & path)50 void SbeDumpCollector::collectHWHBDump(uint8_t type, uint32_t id,
51 uint64_t failingUnit,
52 const std::filesystem::path& path)
53 {
54 lg2::error("Starting dump collection: type:{TYPE} id:{ID} "
55 "failingUnit:{FAILINGUNIT}, path:{PATH}",
56 "TYPE", type, "ID", id, "FAILINGUNIT", failingUnit, "PATH",
57 path.string());
58
59 initializePdbg();
60
61 TargetMap targets;
62
63 struct pdbg_target* target = nullptr;
64 pdbg_for_each_class_target("proc", target)
65 {
66 if (pdbg_target_probe(target) != PDBG_TARGET_ENABLED ||
67 !openpower::phal::pdbg::isTgtFunctional(target))
68 {
69 continue;
70 }
71
72 bool includeTarget = true;
73 // if the dump type is hostboot then call stop instructions
74 if (type == SBE_DUMP_TYPE_HOSTBOOT)
75 {
76 includeTarget = executeThreadStop(target, path);
77 }
78 if (includeTarget)
79 {
80 targets[target] = std::vector<struct pdbg_target*>();
81
82 // Hardware dump needs OCMB data if present
83 if (type == openpower::dump::SBE::SBE_DUMP_TYPE_HARDWARE)
84 {
85 struct pdbg_target* ocmbTarget;
86 pdbg_for_each_target("ocmb", target, ocmbTarget)
87 {
88 if (!is_ody_ocmb_chip(ocmbTarget))
89 {
90 continue;
91 }
92
93 if (pdbg_target_probe(ocmbTarget) != PDBG_TARGET_ENABLED)
94 {
95 continue;
96 }
97
98 if (!openpower::phal::pdbg::isTgtFunctional(ocmbTarget))
99 {
100 continue;
101 }
102 targets[target].push_back(ocmbTarget);
103 }
104 }
105 }
106 }
107
108 std::vector<uint8_t> clockStates = {SBE_CLOCK_ON, SBE_CLOCK_OFF};
109 for (auto cstate : clockStates)
110 {
111 // Skip collection for performance dump if clock state is not ON
112 if (type == SBE_DUMP_TYPE_PERFORMANCE && cstate != SBE_CLOCK_ON)
113 {
114 continue;
115 }
116 auto futures = spawnDumpCollectionProcesses(type, id, path, failingUnit,
117 cstate, targets);
118
119 // Wait for all asynchronous tasks to complete
120 for (auto& future : futures)
121 {
122 try
123 {
124 future.wait();
125 }
126 catch (const std::exception& e)
127 {
128 lg2::error("Failed to collect dump from SBE ErrorMsg({ERROR})",
129 "ERROR", e);
130 }
131 }
132 lg2::info(
133 "Dump collection completed for clock state({CSTATE}): type({TYPE}) "
134 "id({ID}) failingUnit({FAILINGUNIT}), path({PATH})",
135 "CSTATE", cstate, "TYPE", type, "ID", id, "FAILINGUNIT",
136 failingUnit, "PATH", path.string());
137 }
138 if (std::filesystem::is_empty(path))
139 {
140 lg2::error("Failed to collect the dump");
141 throw std::runtime_error("Failed to collect the dump");
142 }
143 lg2::info("Dump collection completed");
144 }
145
// Collects the SBE dump of the chip identified by failingUnit.
//
// The target is resolved with getTargetFromFailingId(); for PROC_SBE_DUMP the
// "pib" target is probed and files carry the "_p10_" tag, otherwise the "fsi"
// target is probed and files carry the "_ody_" tag. After the SBE state check
// and extract-rc step, the local register, PIBMS register, PIBMEM and PPE
// state dumps are collected and the collection is finalized.
//
// Any std::exception raised along the way is logged and rethrown. If the
// failure happens after target setup, finalizeCollection() is additionally
// invoked with a failure indication before rethrowing.
void SbeDumpCollector::collectSBEDump(uint32_t id, uint32_t failingUnit,
                                      const std::filesystem::path& dumpPath,
                                      const int sbeTypeId)
{
    lg2::info("Collecting SBE dump: path={PATH}, id={ID}, "
              "chip position={FAILINGUNIT}",
              "PATH", dumpPath.string().c_str(), "ID", id, "FAILINGUNIT",
              failingUnit);

    struct pdbg_target* proc_ody = nullptr;
    struct pdbg_target* pibFsiTarget = nullptr;
    std::string sbeChipType;

    try
    {
        // Execute pre-collection steps and get the proc target
        initializePdbgLibEkb();

        proc_ody = getTargetFromFailingId(failingUnit, sbeTypeId);
        if (PROC_SBE_DUMP == sbeTypeId)
        {
            pibFsiTarget = probeTarget(proc_ody, "pib", sbeTypeId);
            sbeChipType = "_p10_";
        }
        else
        {
            pibFsiTarget = probeTarget(proc_ody, "fsi", sbeTypeId);
            sbeChipType = "_ody_";
        }
    }
    catch (const std::exception& e)
    {
        lg2::error("Failed to collect the SBE dump: {ERROR}", "ERROR",
                   e.what());
        throw;
    }

    // <id as 8 zero-padded decimal digits>.0_<failingUnit>_SbeData<chip tag>
    const std::string baseFilename = std::format(
        "{:08}.0_{}_SbeData{}", id, failingUnit, sbeChipType);

    try
    {
        checkSbeState(pibFsiTarget, sbeTypeId);

        executeSbeExtractRc(proc_ody, dumpPath, sbeTypeId);

        // Collect various dumps
        collectLocalRegDump(proc_ody, dumpPath, baseFilename, sbeTypeId);
        collectPIBMSRegDump(proc_ody, dumpPath, baseFilename, sbeTypeId);
        collectPIBMEMDump(proc_ody, dumpPath, baseFilename, sbeTypeId);
        collectPPEState(proc_ody, dumpPath, baseFilename, sbeTypeId);

        // Finalize the collection process and indicate successful completion
        finalizeCollection(pibFsiTarget, dumpPath, true, sbeTypeId);

        lg2::info("SBE dump collection completed successfully");
    }
    catch (const std::exception& e)
    {
        lg2::error("Failed to collect the SBE dump: {ERROR}", "ERROR",
                   e.what());
        // In case of any exception, attempt to finalize with a failure
        // state
        // NOTE(review): the guard tests proc_ody while the call uses
        // pibFsiTarget - confirm which pointer is meant to be checked.
        if (proc_ody)
        {
            try
            {
                finalizeCollection(pibFsiTarget, dumpPath, false, sbeTypeId);
            }
            catch (const std::exception& finalizeError)
            {
                // Finalizing is best effort; do not let its failure replace
                // the original error that is rethrown below.
                lg2::error(
                    "Failed to finalize the SBE dump collection: {ERROR}",
                    "ERROR", finalizeError.what());
            }
        }
        throw;
    }
}
217
initializePdbg()218 void SbeDumpCollector::initializePdbg()
219 {
220 openpower::phal::pdbg::init();
221 }
222
spawnDumpCollectionProcesses(uint8_t type,uint32_t id,const std::filesystem::path & path,uint64_t failingUnit,uint8_t cstate,const TargetMap & targetMap)223 std::vector<std::future<void>> SbeDumpCollector::spawnDumpCollectionProcesses(
224 uint8_t type, uint32_t id, const std::filesystem::path& path,
225 uint64_t failingUnit, uint8_t cstate, const TargetMap& targetMap)
226 {
227 std::vector<std::future<void>> futures;
228
229 for (const auto& [procTarget, ocmbTargets] : targetMap)
230 {
231 auto future = std::async(std::launch::async, [this, procTarget,
232 ocmbTargets, path, id,
233 type, cstate,
234 failingUnit]() {
235 try
236 {
237 this->collectDumpFromSBE(procTarget, path, id, type, cstate,
238 failingUnit);
239 }
240 catch (const std::exception& e)
241 {
242 lg2::error(
243 "Failed to collect dump from SBE on Proc-({PROCINDEX}) {ERROR}",
244 "PROCINDEX", pdbg_target_index(procTarget), "ERROR", e);
245 }
246
247 // Collect OCMBs only with clock on
248 if (cstate == SBE_CLOCK_ON)
249 {
250 // Handle OCMBs serially after handling the proc
251 for (auto ocmbTarget : ocmbTargets)
252 {
253 try
254 {
255 this->collectDumpFromSBE(ocmbTarget, path, id, type,
256 cstate, failingUnit);
257 }
258 catch (const std::exception& e)
259 {
260 lg2::error(
261 "Failed to collect dump from OCMB -({OCMBINDEX}) {ERROR}",
262 "OCMBINDEX", pdbg_target_index(ocmbTarget), "ERROR",
263 e);
264 }
265 }
266 }
267 });
268
269 futures.push_back(std::move(future));
270 }
271
272 return futures;
273 }
274
// Creates the PEL(s) for an SBE chip-op error and records them in the dump.
//
// The event name is chosen from the sbeTypeAttributes entry of sbeType
// according to the error type:
//  - SBE_CMD_TIMEOUT: timeout event; one PEL is created and an SBE dump is
//    requested;
//  - SBE_FFDC_NO_DATA: the "no FFDC" event name is selected;
//  - SBE_INTERNAL_FFDC_DATA: internal FFDC event; FFDC packets are processed
//    and the call is not treated as a dump failure;
//  - anything else: chip-op failure event; FFDC packets are processed.
// Every PEL id obtained is appended to the errorInfo file located in the
// parent directory of path.
//
// Exceptions are logged and swallowed. Returns false only for the internal
// FFDC case, true otherwise (including when an exception was swallowed
// before that case was reached).
bool SbeDumpCollector::logErrorAndCreatePEL(
    const openpower::phal::sbeError_t& sbeError, uint64_t chipPos,
    SBETypes sbeType, uint32_t cmdClass, uint32_t cmdType,
    const std::filesystem::path& path)
{
    namespace dumpPel = openpower::dump::pel;
    namespace phalErr = openpower::phal::exception;

    std::string chipName;
    std::string event;
    bool requestDump = false;
    bool chipOpFailed = true;

    // Fetches the log info for a PEL and appends it to the errorInfo file.
    // May throw; each caller reports the failure with its own message.
    auto recordLog = [&](auto logId) {
        auto logInfo = dumpPel::getLogInfo(logId);
        addLogDataToDump(std::get<0>(logInfo), std::get<1>(logInfo), chipName,
                         chipPos, path.parent_path());
        return logInfo;
    };

    try
    {
        const auto& attributes = sbeTypeAttributes.at(sbeType);
        chipName = attributes.chipName;
        event = attributes.chipOpFailure;

        lg2::info("log error {CHIP} {POSITION}", "CHIP", chipName, "POSITION",
                  chipPos);

        // Common FFDC data
        dumpPel::FFDCData pelAdditionalData = {
            {"SRC6", std::format("0x{:X}{:X}", chipPos, (cmdClass | cmdType))}};

        if (sbeType == SBETypes::OCMB)
        {
            pelAdditionalData.emplace_back(
                "CHIP_TYPE", std::to_string(fapi2::TARGET_TYPE_OCMB_CHIP));
        }

        // Check the error type
        const auto errorType = sbeError.errType();
        if (errorType == phalErr::SBE_CMD_TIMEOUT)
        {
            // For timeout, we do not expect any FFDC packets
            event = attributes.chipOpTimeout;
            requestDump = true;
        }
        else if (errorType == phalErr::SBE_FFDC_NO_DATA)
        {
            // We will create a PEL without FFDC with the common information we
            // added
            lg2::error("No FFDC data after a chip-op failure {CHIP} {POSITION}",
                       "CHIP", chipName, "POSITION", chipPos);
            event = attributes.noFfdc;
        }
        else
        {
            if (errorType == phalErr::SBE_INTERNAL_FFDC_DATA)
            {
                lg2::info(
                    "FFDC Not related to chip-op present {CHIP} {POSITION}",
                    "CHIP", chipName, "POSITION", chipPos);
                event = attributes.sbeInternalFFDCData;
                chipOpFailed = false;
            }
            else
            {
                lg2::error("Process FFDC {CHIP} {POSITION}", "CHIP", chipName,
                           "POSITION", chipPos);
            }

            // Processor FFDC Packets
            const std::vector<uint32_t> createdLogs =
                dumpPel::processFFDCPackets(sbeError, event, pelAdditionalData);
            for (const auto createdLog : createdLogs)
            {
                try
                {
                    recordLog(createdLog);
                }
                catch (const std::exception& e)
                {
                    lg2::error("Failed to get error Info: {ERROR} ", "ERROR",
                               e);
                }
            }
        }

        // If dump is required, request it
        if (requestDump)
        {
            auto timeoutLog =
                dumpPel::createSbeErrorPEL(event, sbeError, pelAdditionalData);
            try
            {
                auto logInfo = recordLog(timeoutLog);
                util::requestSBEDump(chipPos, std::get<0>(logInfo), sbeType);
            }
            catch (const std::exception& e)
            {
                lg2::error(
                    "Failed to get error Info, failed to create sbe dump: {ERROR}",
                    "ERROR", e);
            }
        }
    }
    catch (const std::out_of_range& e)
    {
        lg2::error("Unknown SBE Type({SBETYPE}) ErrorMsg({ERROR})", "SBETYPE",
                   sbeType, "ERROR", e);
    }
    catch (const std::exception& e)
    {
        lg2::error("SBE Dump request failed, chip type({CHIPTYPE}) "
                   "position({CHIPPOS}), Error: {ERROR}",
                   "CHIPTYPE", chipName, "CHIPPOS", chipPos, "ERROR", e);
    }

    return chipOpFailed;
}
390
collectDumpFromSBE(struct pdbg_target * chip,const std::filesystem::path & path,uint32_t id,uint8_t type,uint8_t clockState,uint64_t failingUnit)391 void SbeDumpCollector::collectDumpFromSBE(
392 struct pdbg_target* chip, const std::filesystem::path& path, uint32_t id,
393 uint8_t type, uint8_t clockState, uint64_t failingUnit)
394 {
395 auto chipPos = pdbg_target_index(chip);
396 SBETypes sbeType = getSBEType(chip);
397 auto chipName = sbeTypeAttributes.at(sbeType).chipName;
398 lg2::info(
399 "Collecting dump from ({CHIPTYPE}) ({POSITION}): path({PATH}) id({ID}) "
400 "type({TYPE}) clockState({CLOCKSTATE}) failingUnit({FAILINGUNIT})",
401 "CHIPTYPE", chipName, "POSITION", chipPos, "PATH", path.string(), "ID",
402 id, "TYPE", type, "CLOCKSTATE", clockState, "FAILINGUNIT", failingUnit);
403
404 util::DumpDataPtr dataPtr;
405 uint32_t len = 0;
406 uint8_t collectFastArray =
407 checkFastarrayCollectionNeeded(clockState, type, failingUnit, chipPos);
408
409 try
410 {
411 openpower::phal::sbe::getDump(chip, type, clockState, collectFastArray,
412 dataPtr.getPtr(), &len);
413 }
414 catch (const openpower::phal::sbeError_t& sbeError)
415 {
416 if (sbeError.errType() ==
417 openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
418 {
419 // SBE is not ready to accept chip-ops,
420 // Skip the request, no additional error handling required.
421 lg2::info("Collect dump: Skipping ({ERROR}) dump({TYPE}) "
422 "on proc({PROC}) clock state({CLOCKSTATE})",
423 "ERROR", sbeError, "TYPE", type, "PROC", chipPos,
424 "CLOCKSTATE", clockState);
425 return;
426 }
427
428 // If the FFDC is from actual chip-op failure this function will
429 // return true, if the chip-op is not failed but FFDC is present
430 // then create PELs with FFDC but write the dump contents to the
431 // file.
432 if (logErrorAndCreatePEL(sbeError, chipPos, sbeType,
433 SBEFIFO_CMD_CLASS_DUMP, SBEFIFO_CMD_GET_DUMP,
434 path))
435 {
436 lg2::error("Error in collecting dump dump type({TYPE}), "
437 "clockstate({CLOCKSTATE}), chip type({CHIPTYPE}) "
438 "position({POSITION}), "
439 "collectFastArray({COLLECTFASTARRAY}) error({ERROR})",
440 "TYPE", type, "CLOCKSTATE", clockState, "CHIPTYPE",
441 chipName, "POSITION", chipPos, "COLLECTFASTARRAY",
442 collectFastArray, "ERROR", sbeError);
443 return;
444 }
445 }
446 writeDumpFile(path, id, clockState, 0, chipName, chipPos, dataPtr, len);
447 }
448
// Writes len bytes of dump data to a file inside path.
//
// The file name is
//   <id as 8 hex digits>.SbeDataClocks<On|Off>.node<nodeNum>.<chipName><chipPos>
// Failure to open or write the file is logged and reported (File::Open or
// File::Write error) but not propagated, so that dumps collected from other
// SBEs can still be packaged.
void SbeDumpCollector::writeDumpFile(
    const std::filesystem::path& path, const uint32_t id,
    const uint8_t clockState, const uint8_t nodeNum,
    const std::string& chipName, const uint8_t chipPos,
    util::DumpDataPtr& dataPtr, const uint32_t len)
{
    using namespace sdbusplus::xyz::openbmc_project::Common::Error;
    namespace fileError = sdbusplus::xyz::openbmc_project::Common::File::Error;

    // Construct the filename
    const auto dumpPath =
        path / std::format("{:08x}.SbeDataClocks{}.node{}.{}{}", id,
                           (clockState == SBE_CLOCK_ON ? "On" : "Off"),
                           static_cast<int>(nodeNum), chipName,
                           static_cast<int>(chipPos));

    // Attempt to open the file
    std::ofstream outfile(dumpPath, std::ios::out | std::ios::binary);
    if (!outfile)
    {
        using metadata = xyz::openbmc_project::Common::File::Open;
        // Unable to open the file for writing
        auto err = errno;
        lg2::error("Error opening file to write dump, "
                   "errno({ERRNO}), filepath({FILEPATH})",
                   "ERRNO", err, "FILEPATH", dumpPath.string());

        report<fileError::Open>(metadata::ERRNO(err),
                                metadata::PATH(dumpPath.c_str()));
        // Just return here, so that the dumps collected from other
        // SBEs can be packaged.
        return;
    }

    // Write to the file
    try
    {
        // Streams do not throw by default. Enable exceptions so a failed
        // write or close reaches the handler below instead of being reported
        // as a success.
        outfile.exceptions(std::ofstream::failbit | std::ofstream::badbit);

        outfile.write(reinterpret_cast<const char*>(dataPtr.getData()), len);

        // Close explicitly so errors while flushing buffered data are seen
        // here rather than silently dropped by the destructor.
        outfile.close();

        lg2::info("Successfully wrote dump file "
                  "path=({PATH}) size=({SIZE})",
                  "PATH", dumpPath.string(), "SIZE", len);
    }
    catch (const std::ofstream::failure& oe)
    {
        using metadata = xyz::openbmc_project::Common::File::Write;

        lg2::error(
            "Failed to write to dump file, "
            "errorMsg({ERROR}), error({ERRORCODE}), filepath({FILEPATH})",
            "ERROR", oe, "ERRORCODE", oe.code().value(), "FILEPATH",
            dumpPath.string());
        report<fileError::Write>(metadata::ERRNO(oe.code().value()),
                                 metadata::PATH(dumpPath.c_str()));
        // Just return here so dumps collected from other SBEs can be
        // packaged.
    }
}
511
executeThreadStop(struct pdbg_target * target,const std::filesystem::path & path)512 bool SbeDumpCollector::executeThreadStop(struct pdbg_target* target,
513 const std::filesystem::path& path)
514 {
515 try
516 {
517 openpower::phal::sbe::threadStopProc(target);
518 return true;
519 }
520 catch (const openpower::phal::sbeError_t& sbeError)
521 {
522 uint64_t chipPos = pdbg_target_index(target);
523 if (sbeError.errType() ==
524 openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
525 {
526 lg2::info("SBE is not ready to accept chip-op: Skipping "
527 "stop instruction on proc-({POSITION}) error({ERROR}) ",
528 "POSITION", chipPos, "ERROR", sbeError);
529 return false; // Do not include the target for dump collection
530 }
531
532 lg2::error("Stop instructions failed on "
533 "proc-({POSITION}) error({ERROR}) ",
534 "POSITION", chipPos, "ERROR", sbeError);
535
536 logErrorAndCreatePEL(sbeError, chipPos, SBETypes::PROC,
537 SBEFIFO_CMD_CLASS_INSTRUCTION,
538 SBEFIFO_CMD_CONTROL_INSN, path);
539 // For TIMEOUT, log the error and skip adding the processor for dump
540 // collection
541 if (sbeError.errType() == openpower::phal::exception::SBE_CMD_TIMEOUT)
542 {
543 return false;
544 }
545 }
546 // Include the target for dump collection for SBE_CMD_FAILED or any other
547 // non-critical errors
548 return true;
549 }
550
// Appends one PEL entry (id, SRC and "<chipName> <chipPos>" resource) to the
// "errorInfo" file inside path, writing the "ErrorInfo:" header first when
// the file did not exist yet. If the file cannot be opened the failure is
// logged and nothing is written.
void SbeDumpCollector::addLogDataToDump(uint32_t pelId, std::string src,
                                        std::string chipName, uint64_t chipPos,
                                        const std::filesystem::path& path)
{
    // Dump collection runs one asynchronous task per processor, and each of
    // them can reach this function for the same errorInfo file. Serialize the
    // exists-check and the append so the header is written only once and
    // entries from different tasks are not interleaved.
    static std::mutex errorInfoMutex;
    const std::scoped_lock lock(errorInfoMutex);

    const std::filesystem::path info = path / "errorInfo";
    const bool fileExists = std::filesystem::exists(info);

    // Declared after the lock, so the stream is flushed and closed by its
    // destructor before the lock is released.
    std::ofstream fout(info, std::ios::app);
    if (!fout)
    {
        lg2::error("Error: Failed to open the file! {FILE}", "FILE", info);
        lg2::error("No error Info is added to dump file");
        return;
    }

    if (!fileExists)
    {
        fout << "ErrorInfo:" << '\n';
    }
    fout << " " << std::format("{:08x}", pelId) << ":" << '\n';
    fout << " src: " << src << '\n';
    fout << " Resource: " << chipName << " " << std::to_string(chipPos)
         << '\n';
}
575
576 } // namespace openpower::dump::sbe_chipop
577