1 extern "C"
2 {
3 #include <libpdbg.h>
4 #include <libpdbg_sbe.h>
5 }
6
7 #include "create_pel.hpp"
8 #include "sbe_consts.hpp"
9 #include "sbe_dump_collector.hpp"
10 #include "sbe_type.hpp"
11
12 #include <ekb/hwpf/fapi2/include/target_types.H>
13 #include <libphal.H>
14 #include <phal_exception.H>
15
16 #include <phosphor-logging/elog-errors.hpp>
17 #include <phosphor-logging/lg2.hpp>
18 #include <phosphor-logging/log.hpp>
19 #include <sbe_consts.hpp>
20 #include <xyz/openbmc_project/Common/File/error.hpp>
21 #include <xyz/openbmc_project/Common/error.hpp>
22
23 #include <cstdint>
24 #include <filesystem>
25 #include <format>
26 #include <fstream>
27 #include <stdexcept>
28
29 namespace openpower::dump::sbe_chipop
30 {
31
32 using namespace phosphor::logging;
33 using namespace openpower::dump::SBE;
34 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
35
collectDump(uint8_t type,uint32_t id,uint64_t failingUnit,const std::filesystem::path & path)36 void SbeDumpCollector::collectDump(uint8_t type, uint32_t id,
37 uint64_t failingUnit,
38 const std::filesystem::path& path)
39 {
40 lg2::error("Starting dump collection: type:{TYPE} id:{ID} "
41 "failingUnit:{FAILINGUNIT}, path:{PATH}",
42 "TYPE", type, "ID", id, "FAILINGUNIT", failingUnit, "PATH",
43 path.string());
44
45 initializePdbg();
46
47 TargetMap targets;
48
49 struct pdbg_target* target = nullptr;
50 pdbg_for_each_class_target("proc", target)
51 {
52 if (pdbg_target_probe(target) != PDBG_TARGET_ENABLED ||
53 !openpower::phal::pdbg::isTgtFunctional(target))
54 {
55 continue;
56 }
57
58 bool includeTarget = true;
59 // if the dump type is hostboot then call stop instructions
60 if (type == SBE_DUMP_TYPE_HOSTBOOT)
61 {
62 includeTarget = executeThreadStop(target);
63 }
64 if (includeTarget)
65 {
66 targets[target] = std::vector<struct pdbg_target*>();
67
68 // Hardware dump needs OCMB data if present
69 if (type == openpower::dump::SBE::SBE_DUMP_TYPE_HARDWARE)
70 {
71 struct pdbg_target* ocmbTarget;
72 pdbg_for_each_target("ocmb", target, ocmbTarget)
73 {
74 if (!is_ody_ocmb_chip(ocmbTarget))
75 {
76 continue;
77 }
78
79 if (pdbg_target_probe(ocmbTarget) != PDBG_TARGET_ENABLED)
80 {
81 continue;
82 }
83
84 if (!openpower::phal::pdbg::isTgtFunctional(ocmbTarget))
85 {
86 continue;
87 }
88 targets[target].push_back(ocmbTarget);
89 }
90 }
91 }
92 }
93
94 std::vector<uint8_t> clockStates = {SBE_CLOCK_ON, SBE_CLOCK_OFF};
95 for (auto cstate : clockStates)
96 {
97 // Skip collection for performance dump if clock state is not ON
98 if (type == SBE_DUMP_TYPE_PERFORMANCE && cstate != SBE_CLOCK_ON)
99 {
100 continue;
101 }
102 auto futures = spawnDumpCollectionProcesses(type, id, path, failingUnit,
103 cstate, targets);
104
105 // Wait for all asynchronous tasks to complete
106 for (auto& future : futures)
107 {
108 try
109 {
110 future.wait();
111 }
112 catch (const std::exception& e)
113 {
114 lg2::error("Failed to collect dump from SBE ErrorMsg({ERROR})",
115 "ERROR", e);
116 }
117 }
118 lg2::info(
119 "Dump collection completed for clock state({CSTATE}): type({TYPE}) "
120 "id({ID}) failingUnit({FAILINGUNIT}), path({PATH})",
121 "CSTATE", cstate, "TYPE", type, "ID", id, "FAILINGUNIT",
122 failingUnit, "PATH", path.string());
123 }
124 if (std::filesystem::is_empty(path))
125 {
126 lg2::error("Failed to collect the dump");
127 throw std::runtime_error("Failed to collect the dump");
128 }
129 lg2::info("Dump collection completed");
130 }
131
initializePdbg()132 void SbeDumpCollector::initializePdbg()
133 {
134 openpower::phal::pdbg::init();
135 }
136
spawnDumpCollectionProcesses(uint8_t type,uint32_t id,const std::filesystem::path & path,uint64_t failingUnit,uint8_t cstate,const TargetMap & targetMap)137 std::vector<std::future<void>> SbeDumpCollector::spawnDumpCollectionProcesses(
138 uint8_t type, uint32_t id, const std::filesystem::path& path,
139 uint64_t failingUnit, uint8_t cstate, const TargetMap& targetMap)
140 {
141 std::vector<std::future<void>> futures;
142
143 for (const auto& [procTarget, ocmbTargets] : targetMap)
144 {
145 auto future = std::async(std::launch::async, [this, procTarget,
146 ocmbTargets, path, id,
147 type, cstate,
148 failingUnit]() {
149 try
150 {
151 this->collectDumpFromSBE(procTarget, path, id, type, cstate,
152 failingUnit);
153 }
154 catch (const std::exception& e)
155 {
156 lg2::error(
157 "Failed to collect dump from SBE on Proc-({PROCINDEX}) {ERROR}",
158 "PROCINDEX", pdbg_target_index(procTarget), "ERROR", e);
159 }
160
161 // Collect OCMBs only with clock on
162 if (cstate == SBE_CLOCK_ON)
163 {
164 // Handle OCMBs serially after handling the proc
165 for (auto ocmbTarget : ocmbTargets)
166 {
167 try
168 {
169 this->collectDumpFromSBE(ocmbTarget, path, id, type,
170 cstate, failingUnit);
171 }
172 catch (const std::exception& e)
173 {
174 lg2::error(
175 "Failed to collect dump from OCMB -({OCMBINDEX}) {ERROR}",
176 "OCMBINDEX", pdbg_target_index(ocmbTarget), "ERROR",
177 e);
178 }
179 }
180 }
181 });
182
183 futures.push_back(std::move(future));
184 }
185
186 return futures;
187 }
188
/**
 * @brief Classify an SBE chip-op error, log it and raise the matching PEL /
 *        SBE dump request.
 *
 * - SBE_CMD_TIMEOUT: a PEL is created with the common data and an SBE dump
 *   is requested against it.
 * - SBE_FFDC_NO_DATA: the condition is logged and the "no FFDC" event is
 *   selected.
 * - Anything else: the FFDC packets carried by the error are handed to
 *   processFFDCPackets().
 *
 * Exceptions raised while doing so are logged and swallowed.
 *
 * @param[in] sbeError - Error reported by the chip-op
 * @param[in] chipPos - Position of the chip the chip-op was sent to
 * @param[in] sbeType - Kind of SBE (selects chip name and event names)
 * @param[in] cmdClass - Command class of the chip-op
 * @param[in] cmdType - Command type of the chip-op
 *
 * @return false only if the error was identified as SBE_INTERNAL_FFDC_DATA
 *         (FFDC present although the chip-op itself did not fail), true
 *         otherwise
 */
bool SbeDumpCollector::logErrorAndCreatePEL(
    const openpower::phal::sbeError_t& sbeError, uint64_t chipPos,
    SBETypes sbeType, uint32_t cmdClass, uint32_t cmdType)
{
    // Declared outside the try block because the generic handler logs it.
    std::string chipName;
    bool isDumpFailure = true;

    try
    {
        // Throws std::out_of_range for an SBE type without attributes.
        const auto& attributes = sbeTypeAttributes.at(sbeType);
        chipName = attributes.chipName;

        std::string event;
        event = attributes.chipOpFailure;

        lg2::info("log error {CHIP} {POSITION}", "CHIP", chipName, "POSITION",
                  chipPos);

        // Common FFDC data
        const uint32_t command = cmdClass | cmdType;
        openpower::dump::pel::FFDCData pelAdditionalData = {
            {"SRC6", std::format("0x{:X}{:X}", chipPos, command)}};

        if (sbeType == SBETypes::OCMB)
        {
            pelAdditionalData.emplace_back(
                "CHIP_TYPE", std::to_string(fapi2::TARGET_TYPE_OCMB_CHIP));
        }

        // Check the error type
        const auto errType = sbeError.errType();
        if (errType == openpower::phal::exception::SBE_CMD_TIMEOUT)
        {
            // For timeout, we do not expect any FFDC packets; create the PEL
            // from the common data and request an SBE dump against it.
            event = attributes.chipOpTimeout;
            auto logId = openpower::dump::pel::createSbeErrorPEL(
                event, sbeError, pelAdditionalData);
            util::requestSBEDump(chipPos, logId, sbeType);
        }
        else if (errType == openpower::phal::exception::SBE_FFDC_NO_DATA)
        {
            // NOTE(review): only the event name is selected here; no PEL is
            // created by this function on this path - confirm that this is
            // intended.
            lg2::error("No FFDC data after a chip-op failure {CHIP} {POSITION}",
                       "CHIP", chipName, "POSITION", chipPos);
            event = attributes.noFfdc;
        }
        else
        {
            if (errType == openpower::phal::exception::SBE_INTERNAL_FFDC_DATA)
            {
                lg2::info(
                    "FFDC Not related to chip-op present {CHIP} {POSITION}",
                    "CHIP", chipName, "POSITION", chipPos);
                event = attributes.sbeInternalFFDCData;
                // The chip-op itself succeeded, so this is not a dump failure.
                isDumpFailure = false;
            }
            else
            {
                lg2::error("Process FFDC {CHIP} {POSITION}", "CHIP", chipName,
                           "POSITION", chipPos);
            }

            // Processor FFDC Packets
            openpower::dump::pel::processFFDCPackets(sbeError, event,
                                                     pelAdditionalData);
        }
    }
    catch (const std::out_of_range& e)
    {
        lg2::error("Unknown SBE Type({SBETYPE}) ErrorMsg({ERROR})", "SBETYPE",
                   sbeType, "ERROR", e);
    }
    catch (const std::exception& e)
    {
        lg2::error("SBE Dump request failed, chip type({CHIPTYPE}) "
                   "position({CHIPPOS}), Error: {ERROR}",
                   "CHIPTYPE", chipName, "CHIPPOS", chipPos, "ERROR", e);
    }

    return isDumpFailure;
}
276
collectDumpFromSBE(struct pdbg_target * chip,const std::filesystem::path & path,uint32_t id,uint8_t type,uint8_t clockState,uint64_t failingUnit)277 void SbeDumpCollector::collectDumpFromSBE(
278 struct pdbg_target* chip, const std::filesystem::path& path, uint32_t id,
279 uint8_t type, uint8_t clockState, uint64_t failingUnit)
280 {
281 auto chipPos = pdbg_target_index(chip);
282 SBETypes sbeType = getSBEType(chip);
283 auto chipName = sbeTypeAttributes.at(sbeType).chipName;
284 lg2::info(
285 "Collecting dump from ({CHIPTYPE}) ({POSITION}): path({PATH}) id({ID}) "
286 "type({TYPE}) clockState({CLOCKSTATE}) failingUnit({FAILINGUNIT})",
287 "CHIPTYPE", chipName, "POSITION", chipPos, "PATH", path.string(), "ID",
288 id, "TYPE", type, "CLOCKSTATE", clockState, "FAILINGUNIT", failingUnit);
289
290 util::DumpDataPtr dataPtr;
291 uint32_t len = 0;
292 uint8_t collectFastArray =
293 checkFastarrayCollectionNeeded(clockState, type, failingUnit, chipPos);
294
295 try
296 {
297 openpower::phal::sbe::getDump(chip, type, clockState, collectFastArray,
298 dataPtr.getPtr(), &len);
299 }
300 catch (const openpower::phal::sbeError_t& sbeError)
301 {
302 if (sbeError.errType() ==
303 openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
304 {
305 // SBE is not ready to accept chip-ops,
306 // Skip the request, no additional error handling required.
307 lg2::info("Collect dump: Skipping ({ERROR}) dump({TYPE}) "
308 "on proc({PROC}) clock state({CLOCKSTATE})",
309 "ERROR", sbeError, "TYPE", type, "PROC", chipPos,
310 "CLOCKSTATE", clockState);
311 return;
312 }
313
314 // If the FFDC is from actual chip-op failure this function will
315 // return true, if the chip-op is not failed but FFDC is present
316 // then create PELs with FFDC but write the dump contents to the
317 // file.
318 if (logErrorAndCreatePEL(sbeError, chipPos, sbeType,
319 SBEFIFO_CMD_CLASS_DUMP, SBEFIFO_CMD_GET_DUMP))
320 {
321 lg2::error("Error in collecting dump dump type({TYPE}), "
322 "clockstate({CLOCKSTATE}), chip type({CHIPTYPE}) "
323 "position({POSITION}), "
324 "collectFastArray({COLLECTFASTARRAY}) error({ERROR})",
325 "TYPE", type, "CLOCKSTATE", clockState, "CHIPTYPE",
326 chipName, "POSITION", chipPos, "COLLECTFASTARRAY",
327 collectFastArray, "ERROR", sbeError);
328 return;
329 }
330 }
331 writeDumpFile(path, id, clockState, 0, chipName, chipPos, dataPtr, len);
332 }
333
writeDumpFile(const std::filesystem::path & path,const uint32_t id,const uint8_t clockState,const uint8_t nodeNum,const std::string & chipName,const uint8_t chipPos,util::DumpDataPtr & dataPtr,const uint32_t len)334 void SbeDumpCollector::writeDumpFile(
335 const std::filesystem::path& path, const uint32_t id,
336 const uint8_t clockState, const uint8_t nodeNum,
337 const std::string& chipName, const uint8_t chipPos,
338 util::DumpDataPtr& dataPtr, const uint32_t len)
339 {
340 using namespace sdbusplus::xyz::openbmc_project::Common::Error;
341 namespace fileError = sdbusplus::xyz::openbmc_project::Common::File::Error;
342
343 // Construct the filename
344 std::ostringstream filenameBuilder;
345 filenameBuilder << std::hex << std::setw(8) << std::setfill('0') << id
346 << ".SbeDataClocks"
347 << (clockState == SBE_CLOCK_ON ? "On" : "Off") << ".node"
348 << std::dec << static_cast<int>(nodeNum) << "." << chipName
349 << static_cast<int>(chipPos);
350
351 auto dumpPath = path / filenameBuilder.str();
352
353 // Attempt to open the file
354 std::ofstream outfile(dumpPath, std::ios::out | std::ios::binary);
355 if (!outfile)
356 {
357 using namespace sdbusplus::xyz::openbmc_project::Common::File::Error;
358 using metadata = xyz::openbmc_project::Common::File::Open;
359 // Unable to open the file for writing
360 auto err = errno;
361 lg2::error("Error opening file to write dump, "
362 "errno({ERRNO}), filepath({FILEPATH})",
363 "ERRNO", err, "FILEPATH", dumpPath.string());
364
365 report<Open>(metadata::ERRNO(err), metadata::PATH(dumpPath.c_str()));
366 // Just return here, so that the dumps collected from other
367 // SBEs can be packaged.
368 return;
369 }
370
371 // Write to the file
372 try
373 {
374 outfile.write(reinterpret_cast<const char*>(dataPtr.getData()), len);
375
376 lg2::info("Successfully wrote dump file "
377 "path=({PATH}) size=({SIZE})",
378 "PATH", dumpPath.string(), "SIZE", len);
379 }
380 catch (const std::ofstream::failure& oe)
381 {
382 using namespace sdbusplus::xyz::openbmc_project::Common::File::Error;
383 using metadata = xyz::openbmc_project::Common::File::Write;
384
385 lg2::error(
386 "Failed to write to dump file, "
387 "errorMsg({ERROR}), error({ERRORCODE}), filepath({FILEPATH})",
388 "ERROR", oe, "ERRORCODE", oe.code().value(), "FILEPATH",
389 dumpPath.string());
390 report<Write>(metadata::ERRNO(oe.code().value()),
391 metadata::PATH(dumpPath.c_str()));
392 // Just return here so dumps collected from other SBEs can be
393 // packaged.
394 }
395 }
396
executeThreadStop(struct pdbg_target * target)397 bool SbeDumpCollector::executeThreadStop(struct pdbg_target* target)
398 {
399 try
400 {
401 openpower::phal::sbe::threadStopProc(target);
402 return true;
403 }
404 catch (const openpower::phal::sbeError_t& sbeError)
405 {
406 uint64_t chipPos = pdbg_target_index(target);
407 if (sbeError.errType() ==
408 openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
409 {
410 lg2::info("SBE is not ready to accept chip-op: Skipping "
411 "stop instruction on proc-({POSITION}) error({ERROR}) ",
412 "POSITION", chipPos, "ERROR", sbeError);
413 return false; // Do not include the target for dump collection
414 }
415
416 lg2::error("Stop instructions failed on "
417 "proc-({POSITION}) error({ERROR}) ",
418 "POSITION", chipPos, "ERROR", sbeError);
419
420 logErrorAndCreatePEL(sbeError, chipPos, SBETypes::PROC,
421 SBEFIFO_CMD_CLASS_INSTRUCTION,
422 SBEFIFO_CMD_CONTROL_INSN);
423 // For TIMEOUT, log the error and skip adding the processor for dump
424 // collection
425 if (sbeError.errType() == openpower::phal::exception::SBE_CMD_TIMEOUT)
426 {
427 return false;
428 }
429 }
430 // Include the target for dump collection for SBE_CMD_FAILED or any other
431 // non-critical errors
432 return true;
433 }
434
435 } // namespace openpower::dump::sbe_chipop
436