xref: /openbmc/openpower-debug-collector/dump/sbe_dump_collector.hpp (revision fc4f223cb730ae868732532c0bf85a5b62b84852)
1 #pragma once
2 
extern "C"
{
#include <libpdbg.h>
#include <libpdbg_sbe.h>
}

#include "dump_utils.hpp"
#include "sbe_consts.hpp"
#include "sbe_type.hpp"

#include <phal_exception.H>

#include <cstdint>
#include <filesystem>
#include <future>
#include <map>
#include <string>
#include <vector>
19 
20 namespace openpower::dump::sbe_chipop
21 {
22 
/**
 * @brief Ordered map from one pdbg target to a list of further pdbg targets.
 *
 * In SbeDumpCollector::spawnDumpCollectionProcesses the key is a processor
 * target and the mapped vector holds the OCMB targets associated with that
 * processor.
 */
using TargetMap = std::map<struct pdbg_target*,
                           std::vector<struct pdbg_target*>>;
25 
26 /**
27  * @class SbeDumpCollector
28  * @brief Manages the collection of dumps from SBEs on failure.
29  *
30  * This class provides functionalities to orchestrate the collection of
31  * diagnostic dumps from Self Boot Engines across multiple processors
32  * in response to failures or for diagnostic purposes.
33  */
34 class SbeDumpCollector
35 {
36   public:
37     /**
38      * @brief Constructs a new SbeDumpCollector object.
39      */
40     SbeDumpCollector() = default;
41 
42     /**
43      * @brief Destroys the SbeDumpCollector object.
44      */
45     ~SbeDumpCollector() = default;
46 
47     /**
48      * @brief Drives all type of dump collection process from SBEs.
49      *
50      * Triggers SBE, Hardware/Hostboot dump collection process from SBEs.
51      * Internally calls private method collectHWHBDump(for Hardware/Hostboot
52      * dump) or collectSBEDump(for SBE dump) based on the parameter type's value
53      *
54      * @param type The type of dump which needs to be collected.
55      * @param id ID of the collected dump.
56      * @param failingUnit ID of the failing unit from which the dump is
57      * collected.
58      * @param path Path where the collected dump will be stored.
59      */
60     void collectDump(uint8_t type, uint32_t id, uint32_t failingUnit,
61                      const std::filesystem::path& path);
62 
63   private:
64     /**
65      * @brief Orchestrates the collection of dumps from all available SBEs.
66      *
67      * Initiates the process of collecting diagnostic dumps from SBEs. This
68      * involves identifying available processors, initiating the dump
69      * collection process, and managing the collected dump files.
70      *
71      * @param type The type of dump to collect.
72      * @param id A unique identifier for the dump collection operation.
73      * @param failingUnit The identifier of the failing unit prompting the dump
74      * collection.
75      * @param path The filesystem path where collected dumps should be stored.
76      */
77     void collectHWHBDump(uint8_t type, uint32_t id, uint64_t failingUnit,
78                          const std::filesystem::path& path);
79 
80     /**
81      * @brief Execute HWPs to collect SBE dump.
82      *
83      * @param[in] id Id of the dump.
84      * @param[in] failingUnit Id of proc containing failing SBE.
85      * @param[in] dumpPath Path to stored the dump files.
86      * @param[in] sbeTypeId ID for SBE type i.e.; Odyssey or normal memory chip
87      *                                             0xA-->Normal SBE type,
88      * 0xB-->Odyssey SBE type Exceptions: PDBG_INIT_FAIL for any pdbg init
89      * related failure.
90      */
91     void collectSBEDump(uint32_t id, uint32_t failingUnit,
92                         const std::filesystem::path& dumpPath,
93                         const int sbeTypeId);
94 
95     /**
96      * @brief Collects a dump from a single SBE.
97      *
98      * Executes the low-level operations required to collect a diagnostic
99      * dump from the specified SBE.
100      *
101      * @param chip A pointer to the pdbg_target structure representing the SBE.
102      * @param path The filesystem path where the dump should be stored.
103      * @param id The unique identifier for this dump collection operation.
104      * @param type The type of dump to collect.
105      * @param clockState The clock state of the SBE during dump collection.
106      * @param failingUnit The identifier of the failing unit.
107      */
108     void collectDumpFromSBE(struct pdbg_target* chip,
109                             const std::filesystem::path& path, uint32_t id,
110                             uint8_t type, uint8_t clockState,
111                             uint64_t failingUnit);
112 
113     /**
114      * @brief Initializes the PDBG library.
115      *
116      * Prepares the PDBG library for interacting with processor targets. This
117      * must be called before any PDBG-related operations are performed.
118      */
119     void initializePdbg();
120 
121     /**
122      * @brief Launches asynchronous dump collection tasks for a set of targets.
123      *
124      * This method initiates the dump collection process asynchronously for each
125      * target provided in the `targets` vector. It launches a separate
126      * asynchronous task for each target, where each task calls
127      * `collectDumpFromSBE` with the specified parameters, including the clock
128      * state.
129      *
130      * @param type The type of the dump to collect. This could be a hardware
131      * dump, software dump, etc., as defined by the SBE dump type enumeration.
132      * @param id A unique identifier for the dump collection operation. This ID
133      * is used to tag the collected dump for identification.
134      * @param path The filesystem path where the collected dumps should be
135      * stored. Each dump file will be stored under this directory.
136      * @param failingUnit The identifier of the unit or component that is
137      * failing or suspected to be the cause of the issue prompting the dump
138      * collection. This is used for diagnostic purposes.
139      * @param cstate The clock state during the dump collection. This parameter
140      *               dictates whether the dump should be collected with the
141      * clocks running (SBE_CLOCK_ON) or with the clocks stopped (SBE_CLOCK_OFF).
142      * @param targetMap A map of `pdbg_target*` representing the targets from
143      * which dumps should be collected. The key is the proc target with the
144      * list of ocmb targets associated with the proc.
145      *
146      * @return A vector of `std::future<void>` objects. Each future represents
147      * the completion state of an asynchronous dump collection task. The caller
148      *         can wait on these futures to determine when all dump collection
149      * tasks have completed. Exceptions thrown by the asynchronous tasks are
150      * captured by the futures and can be rethrown when the futures are
151      * accessed.
152      */
153     std::vector<std::future<void>> spawnDumpCollectionProcesses(
154         uint8_t type, uint32_t id, const std::filesystem::path& path,
155         uint64_t failingUnit, uint8_t cstate, const TargetMap& targetMap);
156 
157     /** @brief This function creates the new dump file in dump file name
158      * format and then writes the contents into it.
159      *  @param path - Path to dump file
160      *  @param id - A unique id assigned to dump to be collected
161      *  @param clockState - Clock state, ON or Off
162      *  @param nodeNum - Node containing the chip
163      *  @param chipName - Name of the chip
164      *  @param chipPos - Chip position of the failing unit
165      *  @param dataPtr - Content to write to file
166      *  @param len - Length of the content
167      */
168     void writeDumpFile(const std::filesystem::path& path, const uint32_t id,
169                        const uint8_t clockState, const uint8_t nodeNum,
170                        const std::string& chipName, const uint8_t chipPos,
171                        util::DumpDataPtr& dataPtr, const uint32_t len);
172 
173     /**
174      * @brief Determines if fastarray collection is needed based on dump type
175      * and unit.
176      *
177      * @param clockState The current state of the clock.
178      * @param type The type of the dump being collected.
179      * @param failingUnit The ID of the failing unit.
180      * @param chipPos The position of the chip for which the dump is being
181      * collected.
182      *
183      * @return uint8_t - Returns 1 if fastarray collection is needed, 0
184      * otherwise.
185      */
checkFastarrayCollectionNeeded(const uint8_t clockState,const uint8_t type,uint64_t failingUnit,const uint8_t chipPos) const186     inline uint8_t checkFastarrayCollectionNeeded(
187         const uint8_t clockState, const uint8_t type, uint64_t failingUnit,
188         const uint8_t chipPos) const
189     {
190         using namespace openpower::dump::SBE;
191 
192         return (clockState == SBE_CLOCK_OFF &&
193                 (type == SBE_DUMP_TYPE_HOSTBOOT ||
194                  (type == SBE_DUMP_TYPE_HARDWARE && chipPos == failingUnit)))
195                    ? 1
196                    : 0;
197     }
198 
199     /**
200      * Logs an error and creates a PEL for SBE chip-op failures.
201      *
202      * @param sbeError - An error object encapsulating details about the SBE
203      * error.
204      * @param chipPos - The position of the chip where the error occurred.
205      * @param sbeType - The type of SBE, used to determine the event log
206      * message.
207      * @param cmdClass - The command class associated with the SBE operation.
208      * @param cmdType - The specific type of command within the command class.
209      * @param path - Dump collection path.
210      *
211      */
212     bool logErrorAndCreatePEL(const openpower::phal::sbeError_t& sbeError,
213                               uint64_t chipPos, SBETypes sbeType,
214                               uint32_t cmdClass, uint32_t cmdType,
215                               const std::filesystem::path& path);
216 
217     /**
218      * Determines the type of SBE for a given chip target.
219      *
220      * @param chip - A pointer to a pdbg_target structure representing the chip.
221      * @return The SBE type for the given chip target.
222      */
getSBEType(struct pdbg_target * chip)223     inline SBETypes getSBEType([[maybe_unused]] struct pdbg_target* chip)
224     {
225         if (is_ody_ocmb_chip(chip))
226         {
227             return SBETypes::OCMB;
228         }
229         return SBETypes::PROC;
230     }
231 
232     /**
233      * @brief Executes thread stop on a processor target
234      *
235      * If the Self Boot Engine (SBE) is not ready to accept chip operations
236      * (chip-ops), it logs the condition and excludes the processor from the
237      * dump collection process. For critical errors, such as a timeout during
238      * the stop operation, it logs the error and again excludes the processor.
239      * In case of SBE command failure or non-critical errors, it continues with
240      * the dump collection process.
241      *
242      * @param target Pointer to the pdbg target structure representing the
243      *               processor to perform the thread stop on.
244      * @param path Dump collection path
245      * @return true If the thread stop was successful or in case of non-critical
246      *              errors where dump collection can proceed.
247      * @return false If the SBE is not ready for chip-ops or in case of critical
248      *               errors like timeouts, indicating the processor should be
249      *               excluded from the dump collection.
250      */
251     bool executeThreadStop(struct pdbg_target* target,
252                            const std::filesystem::path& path);
253 
254     /**
255      * @brief Add Failure log information to info.yaml file
256      * @param logId - Error Log Id
257      * @param src - Reason Code of PEL
258      * @param chipName - Resource Name
259      * @param chipPos - Resource number
260      * @param path - Dump collection path
261      */
262     void addLogDataToDump(uint32_t logId, std::string src, std::string chipName,
263                           uint64_t chipPos, const std::filesystem::path& path);
264 };
265 } // namespace openpower::dump::sbe_chipop
266