xref: /openbmc/openpower-debug-collector/dump/sbe_dump_collector.cpp (revision fc4f223cb730ae868732532c0bf85a5b62b84852)
1 extern "C"
2 {
3 #include <libpdbg.h>
4 #include <libpdbg_sbe.h>
5 }
6 
7 #include "create_pel.hpp"
8 #include "sbe_consts.hpp"
9 #include "sbe_dump_collector.hpp"
10 #include "sbe_type.hpp"
11 
12 #include <ekb/hwpf/fapi2/include/target_types.H>
13 #include <libphal.H>
14 #include <phal_exception.H>
15 
16 #include <phosphor-logging/elog-errors.hpp>
17 #include <phosphor-logging/lg2.hpp>
18 #include <phosphor-logging/log.hpp>
19 #include <sbe_consts.hpp>
20 #include <xyz/openbmc_project/Common/File/error.hpp>
21 #include <xyz/openbmc_project/Common/error.hpp>
22 
#include <cstdint>
#include <filesystem>
#include <format>
#include <fstream>
#include <map>
#include <mutex>
#include <stdexcept>
29 
30 namespace openpower::dump::sbe_chipop
31 {
32 
33 using namespace phosphor::logging;
34 using namespace openpower::dump::SBE;
35 using namespace openpower::phal::dump;
36 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
37 
collectDump(uint8_t type,uint32_t id,uint32_t failingUnit,const std::filesystem::path & path)38 void SbeDumpCollector::collectDump(uint8_t type, uint32_t id,
39                                    uint32_t failingUnit,
40                                    const std::filesystem::path& path)
41 {
42     if ((type == SBE_DUMP_TYPE_SBE) || (type == SBE_DUMP_TYPE_MSBE))
43     {
44         collectSBEDump(id, failingUnit, path, type);
45         return;
46     }
47     collectHWHBDump(type, id, failingUnit, path);
48 }
49 
collectHWHBDump(uint8_t type,uint32_t id,uint64_t failingUnit,const std::filesystem::path & path)50 void SbeDumpCollector::collectHWHBDump(uint8_t type, uint32_t id,
51                                        uint64_t failingUnit,
52                                        const std::filesystem::path& path)
53 {
54     lg2::error("Starting dump collection: type:{TYPE} id:{ID} "
55                "failingUnit:{FAILINGUNIT}, path:{PATH}",
56                "TYPE", type, "ID", id, "FAILINGUNIT", failingUnit, "PATH",
57                path.string());
58 
59     initializePdbg();
60 
61     TargetMap targets;
62 
63     struct pdbg_target* target = nullptr;
64     pdbg_for_each_class_target("proc", target)
65     {
66         if (pdbg_target_probe(target) != PDBG_TARGET_ENABLED ||
67             !openpower::phal::pdbg::isTgtFunctional(target))
68         {
69             continue;
70         }
71 
72         bool includeTarget = true;
73         // if the dump type is hostboot then call stop instructions
74         if (type == SBE_DUMP_TYPE_HOSTBOOT)
75         {
76             includeTarget = executeThreadStop(target, path);
77         }
78         if (includeTarget)
79         {
80             targets[target] = std::vector<struct pdbg_target*>();
81 
82             // Hardware dump needs OCMB data if present
83             if (type == openpower::dump::SBE::SBE_DUMP_TYPE_HARDWARE)
84             {
85                 struct pdbg_target* ocmbTarget;
86                 pdbg_for_each_target("ocmb", target, ocmbTarget)
87                 {
88                     if (!is_ody_ocmb_chip(ocmbTarget))
89                     {
90                         continue;
91                     }
92 
93                     if (pdbg_target_probe(ocmbTarget) != PDBG_TARGET_ENABLED)
94                     {
95                         continue;
96                     }
97 
98                     if (!openpower::phal::pdbg::isTgtFunctional(ocmbTarget))
99                     {
100                         continue;
101                     }
102                     targets[target].push_back(ocmbTarget);
103                 }
104             }
105         }
106     }
107 
108     std::vector<uint8_t> clockStates = {SBE_CLOCK_ON, SBE_CLOCK_OFF};
109     for (auto cstate : clockStates)
110     {
111         // Skip collection for performance dump if clock state is not ON
112         if (type == SBE_DUMP_TYPE_PERFORMANCE && cstate != SBE_CLOCK_ON)
113         {
114             continue;
115         }
116         auto futures = spawnDumpCollectionProcesses(type, id, path, failingUnit,
117                                                     cstate, targets);
118 
119         // Wait for all asynchronous tasks to complete
120         for (auto& future : futures)
121         {
122             try
123             {
124                 future.wait();
125             }
126             catch (const std::exception& e)
127             {
128                 lg2::error("Failed to collect dump from SBE ErrorMsg({ERROR})",
129                            "ERROR", e);
130             }
131         }
132         lg2::info(
133             "Dump collection completed for clock state({CSTATE}): type({TYPE}) "
134             "id({ID}) failingUnit({FAILINGUNIT}), path({PATH})",
135             "CSTATE", cstate, "TYPE", type, "ID", id, "FAILINGUNIT",
136             failingUnit, "PATH", path.string());
137     }
138     if (std::filesystem::is_empty(path))
139     {
140         lg2::error("Failed to collect the dump");
141         throw std::runtime_error("Failed to collect the dump");
142     }
143     lg2::info("Dump collection completed");
144 }
145 
collectSBEDump(uint32_t id,uint32_t failingUnit,const std::filesystem::path & dumpPath,const int sbeTypeId)146 void SbeDumpCollector::collectSBEDump(uint32_t id, uint32_t failingUnit,
147                                       const std::filesystem::path& dumpPath,
148                                       const int sbeTypeId)
149 {
150     lg2::info("Collecting SBE dump: path={PATH}, id={ID}, "
151               "chip position={FAILINGUNIT}",
152               "PATH", dumpPath.string().c_str(), "ID", id, "FAILINGUNIT",
153               failingUnit);
154 
155     struct pdbg_target* proc_ody = nullptr;
156     struct pdbg_target* pibFsiTarget = nullptr;
157     std::string sbeChipType;
158 
159     try
160     {
161         // Execute pre-collection steps and get the proc target
162         initializePdbgLibEkb();
163 
164         proc_ody = getTargetFromFailingId(failingUnit, sbeTypeId);
165         if (PROC_SBE_DUMP == sbeTypeId)
166         {
167             pibFsiTarget = probeTarget(proc_ody, "pib", sbeTypeId);
168             sbeChipType = "_p10_";
169         }
170         else
171         {
172             pibFsiTarget = probeTarget(proc_ody, "fsi", sbeTypeId);
173             sbeChipType = "_ody_";
174         }
175     }
176     catch (const std::exception& e)
177     {
178         lg2::error("Failed to collect the SBE dump: {ERROR}", "ERROR",
179                    e.what());
180         throw;
181     }
182 
183     std::stringstream ss;
184     ss << std::setw(8) << std::setfill('0') << id;
185 
186     std::string baseFilename = ss.str() + ".0_" + std::to_string(failingUnit) +
187                                "_SbeData" + sbeChipType;
188 
189     try
190     {
191         checkSbeState(pibFsiTarget, sbeTypeId);
192 
193         executeSbeExtractRc(proc_ody, dumpPath, sbeTypeId);
194 
195         // Collect various dumps
196         collectLocalRegDump(proc_ody, dumpPath, baseFilename, sbeTypeId);
197         collectPIBMSRegDump(proc_ody, dumpPath, baseFilename, sbeTypeId);
198         collectPIBMEMDump(proc_ody, dumpPath, baseFilename, sbeTypeId);
199         collectPPEState(proc_ody, dumpPath, baseFilename, sbeTypeId);
200 
201         // Finalize the collection process and indicate successful completion
202         finalizeCollection(pibFsiTarget, dumpPath, true, sbeTypeId);
203 
204         lg2::info("SBE dump collection completed successfully");
205     }
206     catch (const std::exception& e)
207     {
208         lg2::error("Failed to collect the SBE dump: {ERROR}", "ERROR",
209                    e.what());
210         // In case of any exception, attempt to finalize with a failure
211         // state
212         if (proc_ody)
213             finalizeCollection(pibFsiTarget, dumpPath, false, sbeTypeId);
214         throw;
215     }
216 }
217 
initializePdbg()218 void SbeDumpCollector::initializePdbg()
219 {
220     openpower::phal::pdbg::init();
221 }
222 
spawnDumpCollectionProcesses(uint8_t type,uint32_t id,const std::filesystem::path & path,uint64_t failingUnit,uint8_t cstate,const TargetMap & targetMap)223 std::vector<std::future<void>> SbeDumpCollector::spawnDumpCollectionProcesses(
224     uint8_t type, uint32_t id, const std::filesystem::path& path,
225     uint64_t failingUnit, uint8_t cstate, const TargetMap& targetMap)
226 {
227     std::vector<std::future<void>> futures;
228 
229     for (const auto& [procTarget, ocmbTargets] : targetMap)
230     {
231         auto future = std::async(std::launch::async, [this, procTarget,
232                                                       ocmbTargets, path, id,
233                                                       type, cstate,
234                                                       failingUnit]() {
235             try
236             {
237                 this->collectDumpFromSBE(procTarget, path, id, type, cstate,
238                                          failingUnit);
239             }
240             catch (const std::exception& e)
241             {
242                 lg2::error(
243                     "Failed to collect dump from SBE on Proc-({PROCINDEX}) {ERROR}",
244                     "PROCINDEX", pdbg_target_index(procTarget), "ERROR", e);
245             }
246 
247             // Collect OCMBs only with clock on
248             if (cstate == SBE_CLOCK_ON)
249             {
250                 // Handle OCMBs serially after handling the proc
251                 for (auto ocmbTarget : ocmbTargets)
252                 {
253                     try
254                     {
255                         this->collectDumpFromSBE(ocmbTarget, path, id, type,
256                                                  cstate, failingUnit);
257                     }
258                     catch (const std::exception& e)
259                     {
260                         lg2::error(
261                             "Failed to collect dump from OCMB -({OCMBINDEX}) {ERROR}",
262                             "OCMBINDEX", pdbg_target_index(ocmbTarget), "ERROR",
263                             e);
264                     }
265                 }
266             }
267         });
268 
269         futures.push_back(std::move(future));
270     }
271 
272     return futures;
273 }
274 
/**
 * @brief Log an SBE chip-op error, create the matching PEL(s) and record
 *        them in the dump's error info.
 *
 * The event name and follow-up action depend on the SBE error type:
 *  - timeout: a PEL is created and an SBE dump is requested;
 *  - no FFDC data: only the event name is switched to the no-FFDC event;
 *  - anything else: the FFDC packets are processed into PELs.
 *
 * Exceptions raised along the way are logged here and not propagated.
 *
 * @param sbeError SBE error object describing the failure
 * @param chipPos  Position of the chip the chip-op was issued to
 * @param sbeType  SBE type, used to look up chip name and event names
 * @param cmdClass Chip-op command class
 * @param cmdType  Chip-op command type
 * @param path     Dump path; the error info goes into its parent directory
 *
 * @return false only when the FFDC is SBE internal FFDC data (not a chip-op
 *         failure), true otherwise
 */
bool SbeDumpCollector::logErrorAndCreatePEL(
    const openpower::phal::sbeError_t& sbeError, uint64_t chipPos,
    SBETypes sbeType, uint32_t cmdClass, uint32_t cmdType,
    const std::filesystem::path& path)
{
    namespace dumpPel = openpower::dump::pel;
    namespace phalError = openpower::phal::exception;

    std::string chipName;
    std::string event;
    bool dumpIsRequired = false;
    bool isDumpFailure = true;
    try
    {
        // Throws std::out_of_range for an unknown SBE type
        const auto& attributes = sbeTypeAttributes.at(sbeType);
        chipName = attributes.chipName;
        event = attributes.chipOpFailure;

        lg2::info("log error {CHIP} {POSITION}", "CHIP", chipName, "POSITION",
                  chipPos);

        // Common FFDC data
        dumpPel::FFDCData pelAdditionalData = {
            {"SRC6", std::format("0x{:X}{:X}", chipPos, (cmdClass | cmdType))}};

        if (sbeType == SBETypes::OCMB)
        {
            pelAdditionalData.emplace_back(
                "CHIP_TYPE", std::to_string(fapi2::TARGET_TYPE_OCMB_CHIP));
        }

        // Check the error type
        const auto errorType = sbeError.errType();
        if (errorType == phalError::SBE_CMD_TIMEOUT)
        {
            // For timeout, we do not expect any FFDC packets
            event = attributes.chipOpTimeout;
            dumpIsRequired = true;
        }
        else if (errorType == phalError::SBE_FFDC_NO_DATA)
        {
            // The PEL carries only the common information added above
            lg2::error("No FFDC data after a chip-op failure {CHIP} {POSITION}",
                       "CHIP", chipName, "POSITION", chipPos);
            event = attributes.noFfdc;
        }
        else
        {
            if (errorType == phalError::SBE_INTERNAL_FFDC_DATA)
            {
                lg2::info(
                    "FFDC Not related to chip-op present {CHIP} {POSITION}",
                    "CHIP", chipName, "POSITION", chipPos);
                event = attributes.sbeInternalFFDCData;
                isDumpFailure = false;
            }
            else
            {
                lg2::error("Process FFDC {CHIP} {POSITION}", "CHIP", chipName,
                           "POSITION", chipPos);
            }

            // Process the FFDC packets
            const std::vector<uint32_t> createdLogIds =
                dumpPel::processFFDCPackets(sbeError, event,
                                            pelAdditionalData);
            for (const auto createdId : createdLogIds)
            {
                try
                {
                    auto details = dumpPel::getLogInfo(createdId);
                    addLogDataToDump(std::get<0>(details),
                                     std::get<1>(details), chipName, chipPos,
                                     path.parent_path());
                }
                catch (const std::exception& e)
                {
                    lg2::error("Failed to get error Info: {ERROR} ", "ERROR",
                               e);
                }
            }
        }

        // If dump is required, request it
        if (dumpIsRequired)
        {
            auto timeoutLogId =
                dumpPel::createSbeErrorPEL(event, sbeError, pelAdditionalData);
            try
            {
                auto details = dumpPel::getLogInfo(timeoutLogId);
                addLogDataToDump(std::get<0>(details), std::get<1>(details),
                                 chipName, chipPos, path.parent_path());
                util::requestSBEDump(chipPos, std::get<0>(details), sbeType);
            }
            catch (const std::exception& e)
            {
                lg2::error(
                    "Failed to get error Info, failed to create sbe dump: {ERROR}",
                    "ERROR", e);
            }
        }
    }
    catch (const std::out_of_range& e)
    {
        lg2::error("Unknown SBE Type({SBETYPE}) ErrorMsg({ERROR})", "SBETYPE",
                   sbeType, "ERROR", e);
    }
    catch (const std::exception& e)
    {
        lg2::error("SBE Dump request failed, chip type({CHIPTYPE}) "
                   "position({CHIPPOS}), Error: {ERROR}",
                   "CHIPTYPE", chipName, "CHIPPOS", chipPos, "ERROR", e);
    }

    return isDumpFailure;
}
390 
collectDumpFromSBE(struct pdbg_target * chip,const std::filesystem::path & path,uint32_t id,uint8_t type,uint8_t clockState,uint64_t failingUnit)391 void SbeDumpCollector::collectDumpFromSBE(
392     struct pdbg_target* chip, const std::filesystem::path& path, uint32_t id,
393     uint8_t type, uint8_t clockState, uint64_t failingUnit)
394 {
395     auto chipPos = pdbg_target_index(chip);
396     SBETypes sbeType = getSBEType(chip);
397     auto chipName = sbeTypeAttributes.at(sbeType).chipName;
398     lg2::info(
399         "Collecting dump from ({CHIPTYPE}) ({POSITION}): path({PATH}) id({ID}) "
400         "type({TYPE})  clockState({CLOCKSTATE}) failingUnit({FAILINGUNIT})",
401         "CHIPTYPE", chipName, "POSITION", chipPos, "PATH", path.string(), "ID",
402         id, "TYPE", type, "CLOCKSTATE", clockState, "FAILINGUNIT", failingUnit);
403 
404     util::DumpDataPtr dataPtr;
405     uint32_t len = 0;
406     uint8_t collectFastArray =
407         checkFastarrayCollectionNeeded(clockState, type, failingUnit, chipPos);
408 
409     try
410     {
411         openpower::phal::sbe::getDump(chip, type, clockState, collectFastArray,
412                                       dataPtr.getPtr(), &len);
413     }
414     catch (const openpower::phal::sbeError_t& sbeError)
415     {
416         if (sbeError.errType() ==
417             openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
418         {
419             // SBE is not ready to accept chip-ops,
420             // Skip the request, no additional error handling required.
421             lg2::info("Collect dump: Skipping ({ERROR}) dump({TYPE}) "
422                       "on proc({PROC}) clock state({CLOCKSTATE})",
423                       "ERROR", sbeError, "TYPE", type, "PROC", chipPos,
424                       "CLOCKSTATE", clockState);
425             return;
426         }
427 
428         // If the FFDC is from actual chip-op failure this function will
429         // return true, if the chip-op is not failed but FFDC is present
430         // then create PELs with FFDC but write the dump contents to the
431         // file.
432         if (logErrorAndCreatePEL(sbeError, chipPos, sbeType,
433                                  SBEFIFO_CMD_CLASS_DUMP, SBEFIFO_CMD_GET_DUMP,
434                                  path))
435         {
436             lg2::error("Error in collecting dump dump type({TYPE}), "
437                        "clockstate({CLOCKSTATE}), chip type({CHIPTYPE}) "
438                        "position({POSITION}), "
439                        "collectFastArray({COLLECTFASTARRAY}) error({ERROR})",
440                        "TYPE", type, "CLOCKSTATE", clockState, "CHIPTYPE",
441                        chipName, "POSITION", chipPos, "COLLECTFASTARRAY",
442                        collectFastArray, "ERROR", sbeError);
443             return;
444         }
445     }
446     writeDumpFile(path, id, clockState, 0, chipName, chipPos, dataPtr, len);
447 }
448 
writeDumpFile(const std::filesystem::path & path,const uint32_t id,const uint8_t clockState,const uint8_t nodeNum,const std::string & chipName,const uint8_t chipPos,util::DumpDataPtr & dataPtr,const uint32_t len)449 void SbeDumpCollector::writeDumpFile(
450     const std::filesystem::path& path, const uint32_t id,
451     const uint8_t clockState, const uint8_t nodeNum,
452     const std::string& chipName, const uint8_t chipPos,
453     util::DumpDataPtr& dataPtr, const uint32_t len)
454 {
455     using namespace sdbusplus::xyz::openbmc_project::Common::Error;
456     namespace fileError = sdbusplus::xyz::openbmc_project::Common::File::Error;
457 
458     // Construct the filename
459     std::ostringstream filenameBuilder;
460     filenameBuilder << std::hex << std::setw(8) << std::setfill('0') << id
461                     << ".SbeDataClocks"
462                     << (clockState == SBE_CLOCK_ON ? "On" : "Off") << ".node"
463                     << std::dec << static_cast<int>(nodeNum) << "." << chipName
464                     << static_cast<int>(chipPos);
465 
466     auto dumpPath = path / filenameBuilder.str();
467 
468     // Attempt to open the file
469     std::ofstream outfile(dumpPath, std::ios::out | std::ios::binary);
470     if (!outfile)
471     {
472         using namespace sdbusplus::xyz::openbmc_project::Common::File::Error;
473         using metadata = xyz::openbmc_project::Common::File::Open;
474         // Unable to open the file for writing
475         auto err = errno;
476         lg2::error("Error opening file to write dump, "
477                    "errno({ERRNO}), filepath({FILEPATH})",
478                    "ERRNO", err, "FILEPATH", dumpPath.string());
479 
480         report<Open>(metadata::ERRNO(err), metadata::PATH(dumpPath.c_str()));
481         // Just return here, so that the dumps collected from other
482         // SBEs can be packaged.
483         return;
484     }
485 
486     // Write to the file
487     try
488     {
489         outfile.write(reinterpret_cast<const char*>(dataPtr.getData()), len);
490 
491         lg2::info("Successfully wrote dump file "
492                   "path=({PATH}) size=({SIZE})",
493                   "PATH", dumpPath.string(), "SIZE", len);
494     }
495     catch (const std::ofstream::failure& oe)
496     {
497         using namespace sdbusplus::xyz::openbmc_project::Common::File::Error;
498         using metadata = xyz::openbmc_project::Common::File::Write;
499 
500         lg2::error(
501             "Failed to write to dump file, "
502             "errorMsg({ERROR}), error({ERRORCODE}), filepath({FILEPATH})",
503             "ERROR", oe, "ERRORCODE", oe.code().value(), "FILEPATH",
504             dumpPath.string());
505         report<Write>(metadata::ERRNO(oe.code().value()),
506                       metadata::PATH(dumpPath.c_str()));
507         // Just return here so dumps collected from other SBEs can be
508         // packaged.
509     }
510 }
511 
executeThreadStop(struct pdbg_target * target,const std::filesystem::path & path)512 bool SbeDumpCollector::executeThreadStop(struct pdbg_target* target,
513                                          const std::filesystem::path& path)
514 {
515     try
516     {
517         openpower::phal::sbe::threadStopProc(target);
518         return true;
519     }
520     catch (const openpower::phal::sbeError_t& sbeError)
521     {
522         uint64_t chipPos = pdbg_target_index(target);
523         if (sbeError.errType() ==
524             openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
525         {
526             lg2::info("SBE is not ready to accept chip-op: Skipping "
527                       "stop instruction on proc-({POSITION}) error({ERROR}) ",
528                       "POSITION", chipPos, "ERROR", sbeError);
529             return false; // Do not include the target for dump collection
530         }
531 
532         lg2::error("Stop instructions failed on "
533                    "proc-({POSITION}) error({ERROR}) ",
534                    "POSITION", chipPos, "ERROR", sbeError);
535 
536         logErrorAndCreatePEL(sbeError, chipPos, SBETypes::PROC,
537                              SBEFIFO_CMD_CLASS_INSTRUCTION,
538                              SBEFIFO_CMD_CONTROL_INSN, path);
539         // For TIMEOUT, log the error and skip adding the processor for dump
540         // collection
541         if (sbeError.errType() == openpower::phal::exception::SBE_CMD_TIMEOUT)
542         {
543             return false;
544         }
545     }
546     // Include the target for dump collection for SBE_CMD_FAILED or any other
547     // non-critical errors
548     return true;
549 }
550 
/**
 * @brief Append one PEL entry to the "errorInfo" file inside @p path.
 *
 * The file starts with an "ErrorInfo:" header line, written when the file
 * does not exist yet, followed by one entry per call:
 *
 *      <pelId as 8 hex digits>:
 *       src: <src>
 *       Resource: <chipName> <chipPos>
 *
 * Failure to open the file is logged and otherwise ignored.
 *
 * @param pelId    Id of the PEL
 * @param src      SRC of the PEL
 * @param chipName Name of the chip the error belongs to
 * @param chipPos  Position of the chip
 * @param path     Directory holding the errorInfo file
 */
void SbeDumpCollector::addLogDataToDump(uint32_t pelId, std::string src,
                                        std::string chipName, uint64_t chipPos,
                                        const std::filesystem::path& path)
{
    // Dump collection runs one async task per processor, and each task can
    // reach this function through logErrorAndCreatePEL(). Serialize the
    // exists-check and the append so the header is written once and entries
    // from different tasks do not interleave.
    static std::mutex errorInfoMutex;
    const std::lock_guard<std::mutex> guard(errorInfoMutex);

    const std::filesystem::path info = path / "errorInfo";
    const bool fileExists = std::filesystem::exists(info);

    std::ofstream fout(info, std::ios::app);
    if (!fout)
    {
        lg2::error("Error: Failed to open the file! {FILE}", "FILE", info);
        lg2::error("No error Info is added to dump file");
        return;
    }

    // Build the whole entry first and write it with a single flush instead
    // of flushing after every line.
    std::string entry;
    if (!fileExists)
    {
        entry += "ErrorInfo:\n";
    }
    entry += std::format(" {:08x}:\n  src: {}\n  Resource: {} {}\n", pelId,
                         src, chipName, chipPos);

    fout << entry << std::flush;
}
575 
576 } // namespace openpower::dump::sbe_chipop
577