xref: /openbmc/openpower-debug-collector/dump/sbe_dump_collector.cpp (revision 540521edd78007b78f8e6df4b38ca71496862f25)
1 extern "C"
2 {
3 #include <libpdbg.h>
4 #include <libpdbg_sbe.h>
5 }
6 
7 #include "create_pel.hpp"
8 #include "sbe_consts.hpp"
9 #include "sbe_dump_collector.hpp"
10 #include "sbe_type.hpp"
11 
12 #include <ekb/hwpf/fapi2/include/target_types.H>
13 #include <libphal.H>
14 #include <phal_exception.H>
15 
16 #include <phosphor-logging/elog-errors.hpp>
17 #include <phosphor-logging/lg2.hpp>
18 #include <phosphor-logging/log.hpp>
19 #include <sbe_consts.hpp>
20 #include <xyz/openbmc_project/Common/File/error.hpp>
21 #include <xyz/openbmc_project/Common/error.hpp>
22 
23 #include <cstdint>
24 #include <filesystem>
25 #include <format>
26 #include <fstream>
27 #include <stdexcept>
28 
29 namespace openpower::dump::sbe_chipop
30 {
31 
32 using namespace phosphor::logging;
33 using namespace openpower::dump::SBE;
34 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
35 
collectDump(uint8_t type,uint32_t id,uint64_t failingUnit,const std::filesystem::path & path)36 void SbeDumpCollector::collectDump(uint8_t type, uint32_t id,
37                                    uint64_t failingUnit,
38                                    const std::filesystem::path& path)
39 {
40     lg2::error("Starting dump collection: type:{TYPE} id:{ID} "
41                "failingUnit:{FAILINGUNIT}, path:{PATH}",
42                "TYPE", type, "ID", id, "FAILINGUNIT", failingUnit, "PATH",
43                path.string());
44 
45     initializePdbg();
46 
47     TargetMap targets;
48 
49     struct pdbg_target* target = nullptr;
50     pdbg_for_each_class_target("proc", target)
51     {
52         if (pdbg_target_probe(target) != PDBG_TARGET_ENABLED ||
53             !openpower::phal::pdbg::isTgtFunctional(target))
54         {
55             continue;
56         }
57 
58         bool includeTarget = true;
59         // if the dump type is hostboot then call stop instructions
60         if (type == SBE_DUMP_TYPE_HOSTBOOT)
61         {
62             includeTarget = executeThreadStop(target);
63         }
64         if (includeTarget)
65         {
66             targets[target] = std::vector<struct pdbg_target*>();
67 
68             // Hardware dump needs OCMB data if present
69             if (type == openpower::dump::SBE::SBE_DUMP_TYPE_HARDWARE)
70             {
71                 struct pdbg_target* ocmbTarget;
72                 pdbg_for_each_target("ocmb", target, ocmbTarget)
73                 {
74                     if (!is_ody_ocmb_chip(ocmbTarget))
75                     {
76                         continue;
77                     }
78 
79                     if (pdbg_target_probe(ocmbTarget) != PDBG_TARGET_ENABLED)
80                     {
81                         continue;
82                     }
83 
84                     if (!openpower::phal::pdbg::isTgtFunctional(ocmbTarget))
85                     {
86                         continue;
87                     }
88                     targets[target].push_back(ocmbTarget);
89                 }
90             }
91         }
92     }
93 
94     std::vector<uint8_t> clockStates = {SBE_CLOCK_ON, SBE_CLOCK_OFF};
95     for (auto cstate : clockStates)
96     {
97         // Skip collection for performance dump if clock state is not ON
98         if (type == SBE_DUMP_TYPE_PERFORMANCE && cstate != SBE_CLOCK_ON)
99         {
100             continue;
101         }
102         auto futures = spawnDumpCollectionProcesses(type, id, path, failingUnit,
103                                                     cstate, targets);
104 
105         // Wait for all asynchronous tasks to complete
106         for (auto& future : futures)
107         {
108             try
109             {
110                 future.wait();
111             }
112             catch (const std::exception& e)
113             {
114                 lg2::error("Failed to collect dump from SBE ErrorMsg({ERROR})",
115                            "ERROR", e);
116             }
117         }
118         lg2::info(
119             "Dump collection completed for clock state({CSTATE}): type({TYPE}) "
120             "id({ID}) failingUnit({FAILINGUNIT}), path({PATH})",
121             "CSTATE", cstate, "TYPE", type, "ID", id, "FAILINGUNIT",
122             failingUnit, "PATH", path.string());
123     }
124     if (std::filesystem::is_empty(path))
125     {
126         lg2::error("Failed to collect the dump");
127         throw std::runtime_error("Failed to collect the dump");
128     }
129     lg2::info("Dump collection completed");
130 }
131 
initializePdbg()132 void SbeDumpCollector::initializePdbg()
133 {
134     openpower::phal::pdbg::init();
135 }
136 
spawnDumpCollectionProcesses(uint8_t type,uint32_t id,const std::filesystem::path & path,uint64_t failingUnit,uint8_t cstate,const TargetMap & targetMap)137 std::vector<std::future<void>> SbeDumpCollector::spawnDumpCollectionProcesses(
138     uint8_t type, uint32_t id, const std::filesystem::path& path,
139     uint64_t failingUnit, uint8_t cstate, const TargetMap& targetMap)
140 {
141     std::vector<std::future<void>> futures;
142 
143     for (const auto& [procTarget, ocmbTargets] : targetMap)
144     {
145         auto future = std::async(std::launch::async, [this, procTarget,
146                                                       ocmbTargets, path, id,
147                                                       type, cstate,
148                                                       failingUnit]() {
149             try
150             {
151                 this->collectDumpFromSBE(procTarget, path, id, type, cstate,
152                                          failingUnit);
153             }
154             catch (const std::exception& e)
155             {
156                 lg2::error(
157                     "Failed to collect dump from SBE on Proc-({PROCINDEX}) {ERROR}",
158                     "PROCINDEX", pdbg_target_index(procTarget), "ERROR", e);
159             }
160 
161             // Collect OCMBs only with clock on
162             if (cstate == SBE_CLOCK_ON)
163             {
164                 // Handle OCMBs serially after handling the proc
165                 for (auto ocmbTarget : ocmbTargets)
166                 {
167                     try
168                     {
169                         this->collectDumpFromSBE(ocmbTarget, path, id, type,
170                                                  cstate, failingUnit);
171                     }
172                     catch (const std::exception& e)
173                     {
174                         lg2::error(
175                             "Failed to collect dump from OCMB -({OCMBINDEX}) {ERROR}",
176                             "OCMBINDEX", pdbg_target_index(ocmbTarget), "ERROR",
177                             e);
178                     }
179                 }
180             }
181         });
182 
183         futures.push_back(std::move(future));
184     }
185 
186     return futures;
187 }
188 
/**
 * @brief Log an SBE chip-op error and create the corresponding PEL(s).
 *
 * Handling depends on the error type reported by @p sbeError:
 *  - SBE_CMD_TIMEOUT: a PEL is created for the chip-op timeout event and an
 *    SBE dump is requested against it.
 *  - SBE_FFDC_NO_DATA: a PEL is created for the no-FFDC event using only the
 *    common additional data.
 *  - anything else: the FFDC packets are handed to processFFDCPackets().
 *
 * Exceptions derived from std::exception raised while doing this are logged
 * and not propagated.
 *
 * @param[in] sbeError - Error object describing the chip-op failure
 * @param[in] chipPos  - Position of the chip the chip-op was issued to
 * @param[in] sbeType  - Type of SBE (e.g. PROC or OCMB)
 * @param[in] cmdClass - Chip-op command class, OR-ed with @p cmdType in SRC6
 * @param[in] cmdType  - Chip-op command type
 *
 * @return false only when the error type is SBE_INTERNAL_FFDC_DATA (FFDC that
 *         is not related to the chip-op); true in every other case.
 */
bool SbeDumpCollector::logErrorAndCreatePEL(
    const openpower::phal::sbeError_t& sbeError, uint64_t chipPos,
    SBETypes sbeType, uint32_t cmdClass, uint32_t cmdType)
{
    std::string chipName;
    std::string event;
    bool dumpIsRequired = false;
    // Set for the error types whose PEL is created here rather than while
    // processing FFDC packets.
    bool pelIsRequired = false;
    bool isDumpFailure = true;
    try
    {
        chipName = sbeTypeAttributes.at(sbeType).chipName;
        event = sbeTypeAttributes.at(sbeType).chipOpFailure;

        lg2::info("log error {CHIP} {POSITION}", "CHIP", chipName, "POSITION",
                  chipPos);

        // Common FFDC data
        openpower::dump::pel::FFDCData pelAdditionalData = {
            {"SRC6", std::format("0x{:X}{:X}", chipPos, (cmdClass | cmdType))}};

        if (sbeType == SBETypes::OCMB)
        {
            pelAdditionalData.emplace_back(
                "CHIP_TYPE", std::to_string(fapi2::TARGET_TYPE_OCMB_CHIP));
        }

        // Check the error type
        if (sbeError.errType() == openpower::phal::exception::SBE_CMD_TIMEOUT)
        {
            event = sbeTypeAttributes.at(sbeType).chipOpTimeout;
            dumpIsRequired = true;
            pelIsRequired = true;
            // For timeout, we do not expect any FFDC packets
        }
        else if (sbeError.errType() ==
                 openpower::phal::exception::SBE_FFDC_NO_DATA)
        {
            // We will create a PEL without FFDC with the common information we
            // added
            lg2::error("No FFDC data after a chip-op failure {CHIP} {POSITION}",
                       "CHIP", chipName, "POSITION", chipPos);
            event = sbeTypeAttributes.at(sbeType).noFfdc;
            pelIsRequired = true;
        }
        else
        {
            if (sbeError.errType() ==
                openpower::phal::exception::SBE_INTERNAL_FFDC_DATA)
            {
                lg2::info(
                    "FFDC Not related to chip-op present {CHIP} {POSITION}",
                    "CHIP", chipName, "POSITION", chipPos);
                event = sbeTypeAttributes.at(sbeType).sbeInternalFFDCData;
                isDumpFailure = false;
            }
            else
            {
                lg2::error("Process FFDC {CHIP} {POSITION}", "CHIP", chipName,
                           "POSITION", chipPos);
            }
            // Processor FFDC Packets
            openpower::dump::pel::processFFDCPackets(sbeError, event,
                                                     pelAdditionalData);
        }

        if (pelIsRequired)
        {
            auto logId = openpower::dump::pel::createSbeErrorPEL(
                event, sbeError, pelAdditionalData);

            // If dump is required, request it
            if (dumpIsRequired)
            {
                util::requestSBEDump(chipPos, logId, sbeType);
            }
        }
    }
    catch (const std::out_of_range& e)
    {
        lg2::error("Unknown SBE Type({SBETYPE}) ErrorMsg({ERROR})", "SBETYPE",
                   sbeType, "ERROR", e);
    }
    catch (const std::exception& e)
    {
        lg2::error("SBE Dump request failed, chip type({CHIPTYPE}) "
                   "position({CHIPPOS}), Error: {ERROR}",
                   "CHIPTYPE", chipName, "CHIPPOS", chipPos, "ERROR", e);
    }

    return isDumpFailure;
}
276 
collectDumpFromSBE(struct pdbg_target * chip,const std::filesystem::path & path,uint32_t id,uint8_t type,uint8_t clockState,uint64_t failingUnit)277 void SbeDumpCollector::collectDumpFromSBE(
278     struct pdbg_target* chip, const std::filesystem::path& path, uint32_t id,
279     uint8_t type, uint8_t clockState, uint64_t failingUnit)
280 {
281     auto chipPos = pdbg_target_index(chip);
282     SBETypes sbeType = getSBEType(chip);
283     auto chipName = sbeTypeAttributes.at(sbeType).chipName;
284     lg2::info(
285         "Collecting dump from ({CHIPTYPE}) ({POSITION}): path({PATH}) id({ID}) "
286         "type({TYPE})  clockState({CLOCKSTATE}) failingUnit({FAILINGUNIT})",
287         "CHIPTYPE", chipName, "POSITION", chipPos, "PATH", path.string(), "ID",
288         id, "TYPE", type, "CLOCKSTATE", clockState, "FAILINGUNIT", failingUnit);
289 
290     util::DumpDataPtr dataPtr;
291     uint32_t len = 0;
292     uint8_t collectFastArray =
293         checkFastarrayCollectionNeeded(clockState, type, failingUnit, chipPos);
294 
295     try
296     {
297         openpower::phal::sbe::getDump(chip, type, clockState, collectFastArray,
298                                       dataPtr.getPtr(), &len);
299     }
300     catch (const openpower::phal::sbeError_t& sbeError)
301     {
302         if (sbeError.errType() ==
303             openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
304         {
305             // SBE is not ready to accept chip-ops,
306             // Skip the request, no additional error handling required.
307             lg2::info("Collect dump: Skipping ({ERROR}) dump({TYPE}) "
308                       "on proc({PROC}) clock state({CLOCKSTATE})",
309                       "ERROR", sbeError, "TYPE", type, "PROC", chipPos,
310                       "CLOCKSTATE", clockState);
311             return;
312         }
313 
314         // If the FFDC is from actual chip-op failure this function will
315         // return true, if the chip-op is not failed but FFDC is present
316         // then create PELs with FFDC but write the dump contents to the
317         // file.
318         if (logErrorAndCreatePEL(sbeError, chipPos, sbeType,
319                                  SBEFIFO_CMD_CLASS_DUMP, SBEFIFO_CMD_GET_DUMP))
320         {
321             lg2::error("Error in collecting dump dump type({TYPE}), "
322                        "clockstate({CLOCKSTATE}), chip type({CHIPTYPE}) "
323                        "position({POSITION}), "
324                        "collectFastArray({COLLECTFASTARRAY}) error({ERROR})",
325                        "TYPE", type, "CLOCKSTATE", clockState, "CHIPTYPE",
326                        chipName, "POSITION", chipPos, "COLLECTFASTARRAY",
327                        collectFastArray, "ERROR", sbeError);
328             return;
329         }
330     }
331     writeDumpFile(path, id, clockState, 0, chipName, chipPos, dataPtr, len);
332 }
333 
writeDumpFile(const std::filesystem::path & path,const uint32_t id,const uint8_t clockState,const uint8_t nodeNum,const std::string & chipName,const uint8_t chipPos,util::DumpDataPtr & dataPtr,const uint32_t len)334 void SbeDumpCollector::writeDumpFile(
335     const std::filesystem::path& path, const uint32_t id,
336     const uint8_t clockState, const uint8_t nodeNum,
337     const std::string& chipName, const uint8_t chipPos,
338     util::DumpDataPtr& dataPtr, const uint32_t len)
339 {
340     using namespace sdbusplus::xyz::openbmc_project::Common::Error;
341     namespace fileError = sdbusplus::xyz::openbmc_project::Common::File::Error;
342 
343     // Construct the filename
344     std::ostringstream filenameBuilder;
345     filenameBuilder << std::hex << std::setw(8) << std::setfill('0') << id
346                     << ".SbeDataClocks"
347                     << (clockState == SBE_CLOCK_ON ? "On" : "Off") << ".node"
348                     << std::dec << static_cast<int>(nodeNum) << "." << chipName
349                     << static_cast<int>(chipPos);
350 
351     auto dumpPath = path / filenameBuilder.str();
352 
353     // Attempt to open the file
354     std::ofstream outfile(dumpPath, std::ios::out | std::ios::binary);
355     if (!outfile)
356     {
357         using namespace sdbusplus::xyz::openbmc_project::Common::File::Error;
358         using metadata = xyz::openbmc_project::Common::File::Open;
359         // Unable to open the file for writing
360         auto err = errno;
361         lg2::error("Error opening file to write dump, "
362                    "errno({ERRNO}), filepath({FILEPATH})",
363                    "ERRNO", err, "FILEPATH", dumpPath.string());
364 
365         report<Open>(metadata::ERRNO(err), metadata::PATH(dumpPath.c_str()));
366         // Just return here, so that the dumps collected from other
367         // SBEs can be packaged.
368         return;
369     }
370 
371     // Write to the file
372     try
373     {
374         outfile.write(reinterpret_cast<const char*>(dataPtr.getData()), len);
375 
376         lg2::info("Successfully wrote dump file "
377                   "path=({PATH}) size=({SIZE})",
378                   "PATH", dumpPath.string(), "SIZE", len);
379     }
380     catch (const std::ofstream::failure& oe)
381     {
382         using namespace sdbusplus::xyz::openbmc_project::Common::File::Error;
383         using metadata = xyz::openbmc_project::Common::File::Write;
384 
385         lg2::error(
386             "Failed to write to dump file, "
387             "errorMsg({ERROR}), error({ERRORCODE}), filepath({FILEPATH})",
388             "ERROR", oe, "ERRORCODE", oe.code().value(), "FILEPATH",
389             dumpPath.string());
390         report<Write>(metadata::ERRNO(oe.code().value()),
391                       metadata::PATH(dumpPath.c_str()));
392         // Just return here so dumps collected from other SBEs can be
393         // packaged.
394     }
395 }
396 
executeThreadStop(struct pdbg_target * target)397 bool SbeDumpCollector::executeThreadStop(struct pdbg_target* target)
398 {
399     try
400     {
401         openpower::phal::sbe::threadStopProc(target);
402         return true;
403     }
404     catch (const openpower::phal::sbeError_t& sbeError)
405     {
406         uint64_t chipPos = pdbg_target_index(target);
407         if (sbeError.errType() ==
408             openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
409         {
410             lg2::info("SBE is not ready to accept chip-op: Skipping "
411                       "stop instruction on proc-({POSITION}) error({ERROR}) ",
412                       "POSITION", chipPos, "ERROR", sbeError);
413             return false; // Do not include the target for dump collection
414         }
415 
416         lg2::error("Stop instructions failed on "
417                    "proc-({POSITION}) error({ERROR}) ",
418                    "POSITION", chipPos, "ERROR", sbeError);
419 
420         logErrorAndCreatePEL(sbeError, chipPos, SBETypes::PROC,
421                              SBEFIFO_CMD_CLASS_INSTRUCTION,
422                              SBEFIFO_CMD_CONTROL_INSN);
423         // For TIMEOUT, log the error and skip adding the processor for dump
424         // collection
425         if (sbeError.errType() == openpower::phal::exception::SBE_CMD_TIMEOUT)
426         {
427             return false;
428         }
429     }
430     // Include the target for dump collection for SBE_CMD_FAILED or any other
431     // non-critical errors
432     return true;
433 }
434 
435 } // namespace openpower::dump::sbe_chipop
436