1 extern "C"
2 {
3 #include <libpdbg.h>
4 #include <libpdbg_sbe.h>
5 }
6 
7 #include "create_pel.hpp"
8 #include "sbe_consts.hpp"
9 #include "sbe_dump_collector.hpp"
10 #include "sbe_type.hpp"
11 
12 #include <ekb/hwpf/fapi2/include/target_types.H>
13 #include <libphal.H>
14 #include <phal_exception.H>
15 
16 #include <phosphor-logging/elog-errors.hpp>
17 #include <phosphor-logging/lg2.hpp>
18 #include <phosphor-logging/log.hpp>
19 #include <sbe_consts.hpp>
20 #include <xyz/openbmc_project/Common/File/error.hpp>
21 #include <xyz/openbmc_project/Common/error.hpp>
22 
23 #include <cstdint>
24 #include <filesystem>
25 #include <format>
26 #include <fstream>
27 #include <stdexcept>
28 
29 namespace openpower::dump::sbe_chipop
30 {
31 
32 using namespace phosphor::logging;
33 using namespace openpower::dump::SBE;
34 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
35 
collectDump(uint8_t type,uint32_t id,uint64_t failingUnit,const std::filesystem::path & path)36 void SbeDumpCollector::collectDump(uint8_t type, uint32_t id,
37                                    uint64_t failingUnit,
38                                    const std::filesystem::path& path)
39 {
40     lg2::error("Starting dump collection: type:{TYPE} id:{ID} "
41                "failingUnit:{FAILINGUNIT}, path:{PATH}",
42                "TYPE", type, "ID", id, "FAILINGUNIT", failingUnit, "PATH",
43                path.string());
44 
45     initializePdbg();
46 
47     TargetMap targets;
48 
49     struct pdbg_target* target = nullptr;
50     pdbg_for_each_class_target("proc", target)
51     {
52         if (pdbg_target_probe(target) != PDBG_TARGET_ENABLED ||
53             !openpower::phal::pdbg::isTgtFunctional(target))
54         {
55             continue;
56         }
57 
58         bool includeTarget = true;
59         // if the dump type is hostboot then call stop instructions
60         if (type == SBE_DUMP_TYPE_HOSTBOOT)
61         {
62             includeTarget = executeThreadStop(target);
63         }
64         if (includeTarget)
65         {
66             targets[target] = std::vector<struct pdbg_target*>();
67 
68             // Hardware dump needs OCMB data if present
69             if (type == openpower::dump::SBE::SBE_DUMP_TYPE_HARDWARE)
70             {
71                 struct pdbg_target* ocmbTarget;
72                 pdbg_for_each_target("ocmb", target, ocmbTarget)
73                 {
74                     if (!is_ody_ocmb_chip(ocmbTarget))
75                     {
76                         continue;
77                     }
78 
79                     if (pdbg_target_probe(ocmbTarget) != PDBG_TARGET_ENABLED)
80                     {
81                         continue;
82                     }
83 
84                     if (!openpower::phal::pdbg::isTgtFunctional(ocmbTarget))
85                     {
86                         continue;
87                     }
88                     targets[target].push_back(ocmbTarget);
89                 }
90             }
91         }
92     }
93 
94     std::vector<uint8_t> clockStates = {SBE_CLOCK_ON, SBE_CLOCK_OFF};
95     for (auto cstate : clockStates)
96     {
97         // Skip collection for performance dump if clock state is not ON
98         if (type == SBE_DUMP_TYPE_PERFORMANCE && cstate != SBE_CLOCK_ON)
99         {
100             continue;
101         }
102         auto futures = spawnDumpCollectionProcesses(type, id, path, failingUnit,
103                                                     cstate, targets);
104 
105         // Wait for all asynchronous tasks to complete
106         for (auto& future : futures)
107         {
108             try
109             {
110                 future.wait();
111             }
112             catch (const std::exception& e)
113             {
114                 lg2::error("Failed to collect dump from SBE ErrorMsg({ERROR})",
115                            "ERROR", e);
116             }
117         }
118         lg2::info(
119             "Dump collection completed for clock state({CSTATE}): type({TYPE}) "
120             "id({ID}) failingUnit({FAILINGUNIT}), path({PATH})",
121             "CSTATE", cstate, "TYPE", type, "ID", id, "FAILINGUNIT",
122             failingUnit, "PATH", path.string());
123     }
124     if (std::filesystem::is_empty(path))
125     {
126         lg2::error("Failed to collect the dump");
127         throw std::runtime_error("Failed to collect the dump");
128     }
129     lg2::info("Dump collection completed");
130 }
131 
initializePdbg()132 void SbeDumpCollector::initializePdbg()
133 {
134     openpower::phal::pdbg::init();
135 }
136 
spawnDumpCollectionProcesses(uint8_t type,uint32_t id,const std::filesystem::path & path,uint64_t failingUnit,uint8_t cstate,const TargetMap & targetMap)137 std::vector<std::future<void>> SbeDumpCollector::spawnDumpCollectionProcesses(
138     uint8_t type, uint32_t id, const std::filesystem::path& path,
139     uint64_t failingUnit, uint8_t cstate, const TargetMap& targetMap)
140 {
141     std::vector<std::future<void>> futures;
142 
143     for (const auto& [procTarget, ocmbTargets] : targetMap)
144     {
145         auto future = std::async(std::launch::async,
146                                  [this, procTarget, ocmbTargets, path, id, type,
147                                   cstate, failingUnit]() {
148             try
149             {
150                 this->collectDumpFromSBE(procTarget, path, id, type, cstate,
151                                          failingUnit);
152             }
153             catch (const std::exception& e)
154             {
155                 lg2::error(
156                     "Failed to collect dump from SBE on Proc-({PROCINDEX}) {ERROR}",
157                     "PROCINDEX", pdbg_target_index(procTarget), "ERROR", e);
158             }
159 
160             // Collect OCMBs only with clock on
161             if (cstate == SBE_CLOCK_ON)
162             {
163                 // Handle OCMBs serially after handling the proc
164                 for (auto ocmbTarget : ocmbTargets)
165                 {
166                     try
167                     {
168                         this->collectDumpFromSBE(ocmbTarget, path, id, type,
169                                                  cstate, failingUnit);
170                     }
171                     catch (const std::exception& e)
172                     {
173                         lg2::error(
174                             "Failed to collect dump from OCMB -({OCMBINDEX}) {ERROR}",
175                             "OCMBINDEX", pdbg_target_index(ocmbTarget), "ERROR",
176                             e);
177                     }
178                 }
179             }
180         });
181 
182         futures.push_back(std::move(future));
183     }
184 
185     return futures;
186 }
187 
logErrorAndCreatePEL(const openpower::phal::sbeError_t & sbeError,uint64_t chipPos,SBETypes sbeType,uint32_t cmdClass,uint32_t cmdType)188 bool SbeDumpCollector::logErrorAndCreatePEL(
189     const openpower::phal::sbeError_t& sbeError, uint64_t chipPos,
190     SBETypes sbeType, uint32_t cmdClass, uint32_t cmdType)
191 {
192     namespace fs = std::filesystem;
193 
194     std::string chipName;
195     std::string event;
196     bool dumpIsRequired = false;
197     bool isDumpFailure = true;
198     try
199     {
200         chipName = sbeTypeAttributes.at(sbeType).chipName;
201         event = sbeTypeAttributes.at(sbeType).chipOpFailure;
202 
203         lg2::info("log error {CHIP} {POSITION}", "CHIP", chipName, "POSITION",
204                   chipPos);
205 
206         // Common FFDC data
207         openpower::dump::pel::FFDCData pelAdditionalData = {
208             {"SRC6", std::format("0x{:X}{:X}", chipPos, (cmdClass | cmdType))}};
209 
210         if (sbeType == SBETypes::OCMB)
211         {
212             pelAdditionalData.emplace_back(
213                 "CHIP_TYPE", std::to_string(fapi2::TARGET_TYPE_OCMB_CHIP));
214         }
215 
216         // Check the error type
217         if (sbeError.errType() == openpower::phal::exception::SBE_CMD_TIMEOUT)
218         {
219             event = sbeTypeAttributes.at(sbeType).chipOpTimeout;
220             dumpIsRequired = true;
221             // For timeout, we do not expect any FFDC packets
222         }
223         else if (sbeError.errType() ==
224                  openpower::phal::exception::SBE_FFDC_NO_DATA)
225         {
226             // We will create a PEL without FFDC with the common information we
227             // added
228             lg2::error("No FFDC data after a chip-op failure {CHIP} {POSITION}",
229                        "CHIP", chipName, "POSITION", chipPos);
230             event = sbeTypeAttributes.at(sbeType).noFfdc;
231         }
232         else
233         {
234             if (sbeError.errType() ==
235                 openpower::phal::exception::SBE_INTERNAL_FFDC_DATA)
236             {
237                 lg2::info(
238                     "FFDC Not related to chip-op present {CHIP} {POSITION}",
239                     "CHIP", chipName, "POSITION", chipPos);
240                 event = sbeTypeAttributes.at(sbeType).sbeInternalFFDCData;
241                 isDumpFailure = false;
242             }
243             else
244             {
245                 lg2::error("Process FFDC {CHIP} {POSITION}", "CHIP", chipName,
246                            "POSITION", chipPos);
247             }
248             // Processor FFDC Packets
249             openpower::dump::pel::processFFDCPackets(sbeError, event,
250                                                      pelAdditionalData);
251         }
252 
253         // If dump is required, request it
254         if (dumpIsRequired)
255         {
256             auto logId = openpower::dump::pel::createSbeErrorPEL(
257                 event, sbeError, pelAdditionalData);
258             util::requestSBEDump(chipPos, logId, sbeType);
259         }
260     }
261     catch (const std::out_of_range& e)
262     {
263         lg2::error("Unknown SBE Type({SBETYPE}) ErrorMsg({ERROR})", "SBETYPE",
264                    sbeType, "ERROR", e);
265     }
266     catch (const std::exception& e)
267     {
268         lg2::error("SBE Dump request failed, chip type({CHIPTYPE}) "
269                    "position({CHIPPOS}), Error: {ERROR}",
270                    "CHIPTYPE", chipName, "CHIPPOS", chipPos, "ERROR", e);
271     }
272 
273     return isDumpFailure;
274 }
275 
collectDumpFromSBE(struct pdbg_target * chip,const std::filesystem::path & path,uint32_t id,uint8_t type,uint8_t clockState,uint64_t failingUnit)276 void SbeDumpCollector::collectDumpFromSBE(struct pdbg_target* chip,
277                                           const std::filesystem::path& path,
278                                           uint32_t id, uint8_t type,
279                                           uint8_t clockState,
280                                           uint64_t failingUnit)
281 {
282     auto chipPos = pdbg_target_index(chip);
283     SBETypes sbeType = getSBEType(chip);
284     auto chipName = sbeTypeAttributes.at(sbeType).chipName;
285     lg2::info(
286         "Collecting dump from ({CHIPTYPE}) ({POSITION}): path({PATH}) id({ID}) "
287         "type({TYPE})  clockState({CLOCKSTATE}) failingUnit({FAILINGUNIT})",
288         "CHIPTYPE", chipName, "POSITION", chipPos, "PATH", path.string(), "ID",
289         id, "TYPE", type, "CLOCKSTATE", clockState, "FAILINGUNIT", failingUnit);
290 
291     util::DumpDataPtr dataPtr;
292     uint32_t len = 0;
293     uint8_t collectFastArray =
294         checkFastarrayCollectionNeeded(clockState, type, failingUnit, chipPos);
295 
296     try
297     {
298         openpower::phal::sbe::getDump(chip, type, clockState, collectFastArray,
299                                       dataPtr.getPtr(), &len);
300     }
301     catch (const openpower::phal::sbeError_t& sbeError)
302     {
303         if (sbeError.errType() ==
304             openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
305         {
306             // SBE is not ready to accept chip-ops,
307             // Skip the request, no additional error handling required.
308             lg2::info("Collect dump: Skipping ({ERROR}) dump({TYPE}) "
309                       "on proc({PROC}) clock state({CLOCKSTATE})",
310                       "ERROR", sbeError, "TYPE", type, "PROC", chipPos,
311                       "CLOCKSTATE", clockState);
312             return;
313         }
314 
315         // If the FFDC is from actual chip-op failure this function will
316         // return true, if the chip-op is not failed but FFDC is present
317         // then create PELs with FFDC but write the dump contents to the
318         // file.
319         if (logErrorAndCreatePEL(sbeError, chipPos, sbeType,
320                                  SBEFIFO_CMD_CLASS_DUMP, SBEFIFO_CMD_GET_DUMP))
321         {
322             lg2::error("Error in collecting dump dump type({TYPE}), "
323                        "clockstate({CLOCKSTATE}), chip type({CHIPTYPE}) "
324                        "position({POSITION}), "
325                        "collectFastArray({COLLECTFASTARRAY}) error({ERROR})",
326                        "TYPE", type, "CLOCKSTATE", clockState, "CHIPTYPE",
327                        chipName, "POSITION", chipPos, "COLLECTFASTARRAY",
328                        collectFastArray, "ERROR", sbeError);
329             return;
330         }
331     }
332     writeDumpFile(path, id, clockState, 0, chipName, chipPos, dataPtr, len);
333 }
334 
writeDumpFile(const std::filesystem::path & path,const uint32_t id,const uint8_t clockState,const uint8_t nodeNum,const std::string & chipName,const uint8_t chipPos,util::DumpDataPtr & dataPtr,const uint32_t len)335 void SbeDumpCollector::writeDumpFile(
336     const std::filesystem::path& path, const uint32_t id,
337     const uint8_t clockState, const uint8_t nodeNum,
338     const std::string& chipName, const uint8_t chipPos,
339     util::DumpDataPtr& dataPtr, const uint32_t len)
340 {
341     using namespace sdbusplus::xyz::openbmc_project::Common::Error;
342     namespace fileError = sdbusplus::xyz::openbmc_project::Common::File::Error;
343 
344     // Construct the filename
345     std::ostringstream filenameBuilder;
346     filenameBuilder << std::hex << std::setw(8) << std::setfill('0') << id
347                     << ".SbeDataClocks"
348                     << (clockState == SBE_CLOCK_ON ? "On" : "Off") << ".node"
349                     << std::dec << static_cast<int>(nodeNum) << "." << chipName
350                     << static_cast<int>(chipPos);
351 
352     auto dumpPath = path / filenameBuilder.str();
353 
354     // Attempt to open the file
355     std::ofstream outfile(dumpPath, std::ios::out | std::ios::binary);
356     if (!outfile)
357     {
358         using namespace sdbusplus::xyz::openbmc_project::Common::File::Error;
359         using metadata = xyz::openbmc_project::Common::File::Open;
360         // Unable to open the file for writing
361         auto err = errno;
362         lg2::error("Error opening file to write dump, "
363                    "errno({ERRNO}), filepath({FILEPATH})",
364                    "ERRNO", err, "FILEPATH", dumpPath.string());
365 
366         report<Open>(metadata::ERRNO(err), metadata::PATH(dumpPath.c_str()));
367         // Just return here, so that the dumps collected from other
368         // SBEs can be packaged.
369         return;
370     }
371 
372     // Write to the file
373     try
374     {
375         outfile.write(reinterpret_cast<const char*>(dataPtr.getData()), len);
376 
377         lg2::info("Successfully wrote dump file "
378                   "path=({PATH}) size=({SIZE})",
379                   "PATH", dumpPath.string(), "SIZE", len);
380     }
381     catch (const std::ofstream::failure& oe)
382     {
383         using namespace sdbusplus::xyz::openbmc_project::Common::File::Error;
384         using metadata = xyz::openbmc_project::Common::File::Write;
385 
386         lg2::error(
387             "Failed to write to dump file, "
388             "errorMsg({ERROR}), error({ERRORCODE}), filepath({FILEPATH})",
389             "ERROR", oe, "ERRORCODE", oe.code().value(), "FILEPATH",
390             dumpPath.string());
391         report<Write>(metadata::ERRNO(oe.code().value()),
392                       metadata::PATH(dumpPath.c_str()));
393         // Just return here so dumps collected from other SBEs can be
394         // packaged.
395     }
396 }
397 
executeThreadStop(struct pdbg_target * target)398 bool SbeDumpCollector::executeThreadStop(struct pdbg_target* target)
399 {
400     try
401     {
402         openpower::phal::sbe::threadStopProc(target);
403         return true;
404     }
405     catch (const openpower::phal::sbeError_t& sbeError)
406     {
407         uint64_t chipPos = pdbg_target_index(target);
408         if (sbeError.errType() ==
409             openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
410         {
411             lg2::info("SBE is not ready to accept chip-op: Skipping "
412                       "stop instruction on proc-({POSITION}) error({ERROR}) ",
413                       "POSITION", chipPos, "ERROR", sbeError);
414             return false; // Do not include the target for dump collection
415         }
416 
417         lg2::error("Stop instructions failed on "
418                    "proc-({POSITION}) error({ERROR}) ",
419                    "POSITION", chipPos, "ERROR", sbeError);
420 
421         logErrorAndCreatePEL(sbeError, chipPos, SBETypes::PROC,
422                              SBEFIFO_CMD_CLASS_INSTRUCTION,
423                              SBEFIFO_CMD_CONTROL_INSN);
424         // For TIMEOUT, log the error and skip adding the processor for dump
425         // collection
426         if (sbeError.errType() == openpower::phal::exception::SBE_CMD_TIMEOUT)
427         {
428             return false;
429         }
430     }
431     // Include the target for dump collection for SBE_CMD_FAILED or any other
432     // non-critical errors
433     return true;
434 }
435 
436 } // namespace openpower::dump::sbe_chipop
437