xref: /openbmc/openpower-debug-collector/dump/sbe_dump_collector.cpp (revision a24ae1772284558976e2cdf4fe9f314ca5ec72e1)
1 extern "C"
2 {
3 #include <libpdbg.h>
4 #include <libpdbg_sbe.h>
5 }
6 
7 #include "create_pel.hpp"
8 #include "sbe_consts.hpp"
9 #include "sbe_dump_collector.hpp"
10 #include "sbe_type.hpp"
11 
12 #include <ekb/hwpf/fapi2/include/target_types.H>
13 #include <libphal.H>
14 #include <phal_exception.H>
15 
16 #include <phosphor-logging/elog-errors.hpp>
17 #include <phosphor-logging/lg2.hpp>
18 #include <phosphor-logging/log.hpp>
19 #include <sbe_consts.hpp>
20 #include <xyz/openbmc_project/Common/File/error.hpp>
21 #include <xyz/openbmc_project/Common/error.hpp>
22 
23 #include <cstdint>
24 #include <filesystem>
25 #include <format>
26 #include <fstream>
27 #include <stdexcept>
28 
29 namespace openpower::dump::sbe_chipop
30 {
31 
32 using namespace phosphor::logging;
33 using namespace openpower::dump::SBE;
34 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
35 
36 void SbeDumpCollector::collectDump(uint8_t type, uint32_t id,
37                                    uint64_t failingUnit,
38                                    const std::filesystem::path& path)
39 {
40     lg2::error("Starting dump collection: type:{TYPE} id:{ID} "
41                "failingUnit:{FAILINGUNIT}, path:{PATH}",
42                "TYPE", type, "ID", id, "FAILINGUNIT", failingUnit, "PATH",
43                path.string());
44 
45     initializePdbg();
46 
47     TargetMap targets;
48 
49     struct pdbg_target* target = nullptr;
50     pdbg_for_each_class_target("proc", target)
51     {
52         if (pdbg_target_probe(target) != PDBG_TARGET_ENABLED ||
53             !openpower::phal::pdbg::isTgtFunctional(target))
54         {
55             continue;
56         }
57 
58         bool includeTarget = true;
59         // if the dump type is hostboot then call stop instructions
60         if (type == SBE_DUMP_TYPE_HOSTBOOT)
61         {
62             includeTarget = executeThreadStop(target, path);
63         }
64         if (includeTarget)
65         {
66             targets[target] = std::vector<struct pdbg_target*>();
67 
68             // Hardware dump needs OCMB data if present
69             if (type == openpower::dump::SBE::SBE_DUMP_TYPE_HARDWARE)
70             {
71                 struct pdbg_target* ocmbTarget;
72                 pdbg_for_each_target("ocmb", target, ocmbTarget)
73                 {
74                     if (!is_ody_ocmb_chip(ocmbTarget))
75                     {
76                         continue;
77                     }
78 
79                     if (pdbg_target_probe(ocmbTarget) != PDBG_TARGET_ENABLED)
80                     {
81                         continue;
82                     }
83 
84                     if (!openpower::phal::pdbg::isTgtFunctional(ocmbTarget))
85                     {
86                         continue;
87                     }
88                     targets[target].push_back(ocmbTarget);
89                 }
90             }
91         }
92     }
93 
94     std::vector<uint8_t> clockStates = {SBE_CLOCK_ON, SBE_CLOCK_OFF};
95     for (auto cstate : clockStates)
96     {
97         // Skip collection for performance dump if clock state is not ON
98         if (type == SBE_DUMP_TYPE_PERFORMANCE && cstate != SBE_CLOCK_ON)
99         {
100             continue;
101         }
102         auto futures = spawnDumpCollectionProcesses(type, id, path, failingUnit,
103                                                     cstate, targets);
104 
105         // Wait for all asynchronous tasks to complete
106         for (auto& future : futures)
107         {
108             try
109             {
110                 future.wait();
111             }
112             catch (const std::exception& e)
113             {
114                 lg2::error("Failed to collect dump from SBE ErrorMsg({ERROR})",
115                            "ERROR", e);
116             }
117         }
118         lg2::info(
119             "Dump collection completed for clock state({CSTATE}): type({TYPE}) "
120             "id({ID}) failingUnit({FAILINGUNIT}), path({PATH})",
121             "CSTATE", cstate, "TYPE", type, "ID", id, "FAILINGUNIT",
122             failingUnit, "PATH", path.string());
123     }
124     if (std::filesystem::is_empty(path))
125     {
126         lg2::error("Failed to collect the dump");
127         throw std::runtime_error("Failed to collect the dump");
128     }
129     lg2::info("Dump collection completed");
130 }
131 
132 void SbeDumpCollector::initializePdbg()
133 {
134     openpower::phal::pdbg::init();
135 }
136 
137 std::vector<std::future<void>> SbeDumpCollector::spawnDumpCollectionProcesses(
138     uint8_t type, uint32_t id, const std::filesystem::path& path,
139     uint64_t failingUnit, uint8_t cstate, const TargetMap& targetMap)
140 {
141     std::vector<std::future<void>> futures;
142 
143     for (const auto& [procTarget, ocmbTargets] : targetMap)
144     {
145         auto future = std::async(std::launch::async, [this, procTarget,
146                                                       ocmbTargets, path, id,
147                                                       type, cstate,
148                                                       failingUnit]() {
149             try
150             {
151                 this->collectDumpFromSBE(procTarget, path, id, type, cstate,
152                                          failingUnit);
153             }
154             catch (const std::exception& e)
155             {
156                 lg2::error(
157                     "Failed to collect dump from SBE on Proc-({PROCINDEX}) {ERROR}",
158                     "PROCINDEX", pdbg_target_index(procTarget), "ERROR", e);
159             }
160 
161             // Collect OCMBs only with clock on
162             if (cstate == SBE_CLOCK_ON)
163             {
164                 // Handle OCMBs serially after handling the proc
165                 for (auto ocmbTarget : ocmbTargets)
166                 {
167                     try
168                     {
169                         this->collectDumpFromSBE(ocmbTarget, path, id, type,
170                                                  cstate, failingUnit);
171                     }
172                     catch (const std::exception& e)
173                     {
174                         lg2::error(
175                             "Failed to collect dump from OCMB -({OCMBINDEX}) {ERROR}",
176                             "OCMBINDEX", pdbg_target_index(ocmbTarget), "ERROR",
177                             e);
178                     }
179                 }
180             }
181         });
182 
183         futures.push_back(std::move(future));
184     }
185 
186     return futures;
187 }
188 
/**
 * @brief Classify an SBE chip-op error, log it and record the resulting
 *        error logs in the dump.
 *
 * - SBE_CMD_TIMEOUT: a PEL is created with the timeout event and an SBE dump
 *   is requested for the chip.
 * - SBE_FFDC_NO_DATA: the condition is logged and the event name is switched.
 * - anything else: the FFDC packets are handed to processFFDCPackets() and
 *   every returned log id is appended to the dump's error info.
 *
 * Exceptions derived from std::exception are logged here and not propagated.
 *
 * @param[in] sbeError - Error raised by the chip-op
 * @param[in] chipPos  - Position of the chip the chip-op was issued on
 * @param[in] sbeType  - Kind of SBE (selects chip name and event names)
 * @param[in] cmdClass - Chip-op command class, used for SRC6
 * @param[in] cmdType  - Chip-op command type, used for SRC6
 * @param[in] path     - Dump path; the error info goes to its parent directory
 *
 * @return false once the error has been classified as
 *         SBE_INTERNAL_FFDC_DATA (FFDC not related to the chip-op), true in
 *         every other case, including when the handling itself failed.
 */
bool SbeDumpCollector::logErrorAndCreatePEL(
    const openpower::phal::sbeError_t& sbeError, uint64_t chipPos,
    SBETypes sbeType, uint32_t cmdClass, uint32_t cmdType,
    const std::filesystem::path& path)
{
    namespace pelUtil = openpower::dump::pel;
    namespace phalError = openpower::phal::exception;

    // chipName and event remain empty when the SBE type lookup below throws.
    std::string chipName;
    std::string event;
    bool dumpIsRequired = false;
    bool isDumpFailure = true;

    try
    {
        // A single lookup; throws std::out_of_range for an unknown SBE type.
        const auto& attrs = sbeTypeAttributes.at(sbeType);
        chipName = attrs.chipName;
        event = attrs.chipOpFailure;

        lg2::info("log error {CHIP} {POSITION}", "CHIP", chipName, "POSITION",
                  chipPos);

        // FFDC data shared by every PEL created for this error
        pelUtil::FFDCData pelAdditionalData = {
            {"SRC6", std::format("0x{:X}{:X}", chipPos, (cmdClass | cmdType))}};

        if (sbeType == SBETypes::OCMB)
        {
            pelAdditionalData.emplace_back(
                "CHIP_TYPE", std::to_string(fapi2::TARGET_TYPE_OCMB_CHIP));
        }

        const auto errType = sbeError.errType();
        if (errType == phalError::SBE_CMD_TIMEOUT)
        {
            // No FFDC packets are expected for a timeout; the PEL is created
            // further down and an SBE dump is requested.
            dumpIsRequired = true;
            event = attrs.chipOpTimeout;
        }
        else if (errType == phalError::SBE_FFDC_NO_DATA)
        {
            // NOTE(review): only the event name changes here; no PEL is
            // created for this case inside this function, because the PEL
            // creation below is limited to dumpIsRequired. Confirm intended.
            lg2::error("No FFDC data after a chip-op failure {CHIP} {POSITION}",
                       "CHIP", chipName, "POSITION", chipPos);
            event = attrs.noFfdc;
        }
        else
        {
            const bool internalFfdc =
                (errType == phalError::SBE_INTERNAL_FFDC_DATA);
            if (!internalFfdc)
            {
                lg2::error("Process FFDC {CHIP} {POSITION}", "CHIP", chipName,
                           "POSITION", chipPos);
            }
            else
            {
                // FFDC is present although the chip-op itself did not fail
                lg2::info(
                    "FFDC Not related to chip-op present {CHIP} {POSITION}",
                    "CHIP", chipName, "POSITION", chipPos);
                event = attrs.sbeInternalFFDCData;
                isDumpFailure = false;
            }

            // Process the FFDC packets and note each returned log in the dump
            const std::vector<uint32_t> logIdList =
                pelUtil::processFFDCPackets(sbeError, event,
                                            pelAdditionalData);
            for (const auto logId : logIdList)
            {
                try
                {
                    auto logInfo = pelUtil::getLogInfo(logId);
                    addLogDataToDump(std::get<0>(logInfo), std::get<1>(logInfo),
                                     chipName, chipPos, path.parent_path());
                }
                catch (const std::exception& e)
                {
                    lg2::error("Failed to get error Info: {ERROR} ", "ERROR",
                               e);
                }
            }
        }

        if (dumpIsRequired)
        {
            // Create the PEL, record it in the dump and request the SBE dump
            auto logId =
                pelUtil::createSbeErrorPEL(event, sbeError, pelAdditionalData);
            try
            {
                auto logInfo = pelUtil::getLogInfo(logId);
                addLogDataToDump(std::get<0>(logInfo), std::get<1>(logInfo),
                                 chipName, chipPos, path.parent_path());
                util::requestSBEDump(chipPos, std::get<0>(logInfo), sbeType);
            }
            catch (const std::exception& e)
            {
                lg2::error(
                    "Failed to get error Info, failed to create sbe dump: {ERROR}",
                    "ERROR", e);
            }
        }
    }
    catch (const std::out_of_range& e)
    {
        lg2::error("Unknown SBE Type({SBETYPE}) ErrorMsg({ERROR})", "SBETYPE",
                   sbeType, "ERROR", e);
    }
    catch (const std::exception& e)
    {
        lg2::error("SBE Dump request failed, chip type({CHIPTYPE}) "
                   "position({CHIPPOS}), Error: {ERROR}",
                   "CHIPTYPE", chipName, "CHIPPOS", chipPos, "ERROR", e);
    }

    return isDumpFailure;
}
304 
305 void SbeDumpCollector::collectDumpFromSBE(
306     struct pdbg_target* chip, const std::filesystem::path& path, uint32_t id,
307     uint8_t type, uint8_t clockState, uint64_t failingUnit)
308 {
309     auto chipPos = pdbg_target_index(chip);
310     SBETypes sbeType = getSBEType(chip);
311     auto chipName = sbeTypeAttributes.at(sbeType).chipName;
312     lg2::info(
313         "Collecting dump from ({CHIPTYPE}) ({POSITION}): path({PATH}) id({ID}) "
314         "type({TYPE})  clockState({CLOCKSTATE}) failingUnit({FAILINGUNIT})",
315         "CHIPTYPE", chipName, "POSITION", chipPos, "PATH", path.string(), "ID",
316         id, "TYPE", type, "CLOCKSTATE", clockState, "FAILINGUNIT", failingUnit);
317 
318     util::DumpDataPtr dataPtr;
319     uint32_t len = 0;
320     uint8_t collectFastArray =
321         checkFastarrayCollectionNeeded(clockState, type, failingUnit, chipPos);
322 
323     try
324     {
325         openpower::phal::sbe::getDump(chip, type, clockState, collectFastArray,
326                                       dataPtr.getPtr(), &len);
327     }
328     catch (const openpower::phal::sbeError_t& sbeError)
329     {
330         if (sbeError.errType() ==
331             openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
332         {
333             // SBE is not ready to accept chip-ops,
334             // Skip the request, no additional error handling required.
335             lg2::info("Collect dump: Skipping ({ERROR}) dump({TYPE}) "
336                       "on proc({PROC}) clock state({CLOCKSTATE})",
337                       "ERROR", sbeError, "TYPE", type, "PROC", chipPos,
338                       "CLOCKSTATE", clockState);
339             return;
340         }
341 
342         // If the FFDC is from actual chip-op failure this function will
343         // return true, if the chip-op is not failed but FFDC is present
344         // then create PELs with FFDC but write the dump contents to the
345         // file.
346         if (logErrorAndCreatePEL(sbeError, chipPos, sbeType,
347                                  SBEFIFO_CMD_CLASS_DUMP, SBEFIFO_CMD_GET_DUMP,
348                                  path))
349         {
350             lg2::error("Error in collecting dump dump type({TYPE}), "
351                        "clockstate({CLOCKSTATE}), chip type({CHIPTYPE}) "
352                        "position({POSITION}), "
353                        "collectFastArray({COLLECTFASTARRAY}) error({ERROR})",
354                        "TYPE", type, "CLOCKSTATE", clockState, "CHIPTYPE",
355                        chipName, "POSITION", chipPos, "COLLECTFASTARRAY",
356                        collectFastArray, "ERROR", sbeError);
357             return;
358         }
359     }
360     writeDumpFile(path, id, clockState, 0, chipName, chipPos, dataPtr, len);
361 }
362 
/**
 * @brief Write the data collected from one SBE into its dump file.
 *
 * The file is created in @p path and is named
 * "<id as 8 hex digits>.SbeDataClocks<On|Off>.node<nodeNum>.<chipName><chipPos>".
 *
 * Open and write failures are logged and reported but never propagated, so
 * that the dumps collected from other SBEs can still be packaged.
 *
 * @param[in] path       - Directory that receives the dump file
 * @param[in] id         - Dump id
 * @param[in] clockState - Clock state the data was collected with
 * @param[in] nodeNum    - Node number
 * @param[in] chipName   - Name of the chip the data came from
 * @param[in] chipPos    - Position of that chip
 * @param[in] dataPtr    - Holder of the collected data
 * @param[in] len        - Number of bytes to write
 */
void SbeDumpCollector::writeDumpFile(
    const std::filesystem::path& path, const uint32_t id,
    const uint8_t clockState, const uint8_t nodeNum,
    const std::string& chipName, const uint8_t chipPos,
    util::DumpDataPtr& dataPtr, const uint32_t len)
{
    namespace fileError = sdbusplus::xyz::openbmc_project::Common::File::Error;

    // Construct the filename; the uint8_t values are widened so they are
    // formatted as numbers rather than characters.
    const auto fileName = std::format(
        "{:08x}.SbeDataClocks{}.node{}.{}{}", id,
        (clockState == SBE_CLOCK_ON ? "On" : "Off"), static_cast<int>(nodeNum),
        chipName, static_cast<int>(chipPos));

    auto dumpPath = path / fileName;

    // Attempt to open the file
    std::ofstream outfile(dumpPath, std::ios::out | std::ios::binary);
    if (!outfile)
    {
        using metadata = xyz::openbmc_project::Common::File::Open;
        // Unable to open the file for writing
        auto err = errno;
        lg2::error("Error opening file to write dump, "
                   "errno({ERRNO}), filepath({FILEPATH})",
                   "ERRNO", err, "FILEPATH", dumpPath.string());

        report<fileError::Open>(metadata::ERRNO(err),
                                metadata::PATH(dumpPath.c_str()));
        // Just return here, so that the dumps collected from other
        // SBEs can be packaged.
        return;
    }

    // Write to the file
    try
    {
        // Streams do not throw unless asked to. Without this the handler
        // below is unreachable and a failed write would still be logged as a
        // success.
        outfile.exceptions(std::ofstream::failbit | std::ofstream::badbit);

        outfile.write(reinterpret_cast<const char*>(dataPtr.getData()), len);
        // Push the buffered data out now so that a late I/O error is also
        // detected before success is logged.
        outfile.flush();

        lg2::info("Successfully wrote dump file "
                  "path=({PATH}) size=({SIZE})",
                  "PATH", dumpPath.string(), "SIZE", len);
    }
    catch (const std::ofstream::failure& oe)
    {
        using metadata = xyz::openbmc_project::Common::File::Write;

        lg2::error(
            "Failed to write to dump file, "
            "errorMsg({ERROR}), error({ERRORCODE}), filepath({FILEPATH})",
            "ERROR", oe, "ERRORCODE", oe.code().value(), "FILEPATH",
            dumpPath.string());
        report<fileError::Write>(metadata::ERRNO(oe.code().value()),
                                 metadata::PATH(dumpPath.c_str()));
        // Just return here so dumps collected from other SBEs can be
        // packaged.
    }
}
425 
426 bool SbeDumpCollector::executeThreadStop(struct pdbg_target* target,
427                                          const std::filesystem::path& path)
428 {
429     try
430     {
431         openpower::phal::sbe::threadStopProc(target);
432         return true;
433     }
434     catch (const openpower::phal::sbeError_t& sbeError)
435     {
436         uint64_t chipPos = pdbg_target_index(target);
437         if (sbeError.errType() ==
438             openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
439         {
440             lg2::info("SBE is not ready to accept chip-op: Skipping "
441                       "stop instruction on proc-({POSITION}) error({ERROR}) ",
442                       "POSITION", chipPos, "ERROR", sbeError);
443             return false; // Do not include the target for dump collection
444         }
445 
446         lg2::error("Stop instructions failed on "
447                    "proc-({POSITION}) error({ERROR}) ",
448                    "POSITION", chipPos, "ERROR", sbeError);
449 
450         logErrorAndCreatePEL(sbeError, chipPos, SBETypes::PROC,
451                              SBEFIFO_CMD_CLASS_INSTRUCTION,
452                              SBEFIFO_CMD_CONTROL_INSN, path);
453         // For TIMEOUT, log the error and skip adding the processor for dump
454         // collection
455         if (sbeError.errType() == openpower::phal::exception::SBE_CMD_TIMEOUT)
456         {
457             return false;
458         }
459     }
460     // Include the target for dump collection for SBE_CMD_FAILED or any other
461     // non-critical errors
462     return true;
463 }
464 
/**
 * @brief Append one error-log entry to the "errorInfo" file in @p path.
 *
 * The "ErrorInfo:" heading is written only when the file did not exist
 * before this call. If the file cannot be opened the problem is logged and
 * nothing is added.
 *
 * @param[in] pelId    - PEL id, written as 8 hex digits
 * @param[in] src      - SRC of the error log
 * @param[in] chipName - Name of the chip the error belongs to
 * @param[in] chipPos  - Position of that chip
 * @param[in] path     - Directory holding the errorInfo file
 */
void SbeDumpCollector::addLogDataToDump(uint32_t pelId, std::string src,
                                        std::string chipName, uint64_t chipPos,
                                        const std::filesystem::path& path)
{
    std::filesystem::path info = path / "errorInfo";

    // Must be sampled before opening: opening in append mode creates the file
    const bool needsHeading = !std::filesystem::exists(info);

    std::ofstream fout(info, std::ios::app);
    if (!fout)
    {
        lg2::error("Error: Failed to open the file! {FILE}", "FILE", info);
        lg2::error("No error Info is added to dump file");
        return;
    }

    if (needsHeading)
    {
        fout << "ErrorInfo:" << std::endl;
    }

    fout << std::format(" {:08x}:", pelId) << std::endl;
    fout << "  src: " << src << std::endl;
    fout << "  Resource: " << chipName << " " << chipPos << std::endl;
}
489 
490 } // namespace openpower::dump::sbe_chipop
491