1 #pragma once 2 3 extern "C" 4 { 5 #include <libpdbg.h> 6 #include <libpdbg_sbe.h> 7 } 8 9 #include "dump_utils.hpp" 10 #include "sbe_consts.hpp" 11 #include "sbe_type.hpp" 12 13 #include <phal_exception.H> 14 15 #include <cstdint> 16 #include <filesystem> 17 #include <future> 18 #include <vector> 19 20 namespace openpower::dump::sbe_chipop 21 { 22 23 using TargetMap = 24 std::map<struct pdbg_target*, std::vector<struct pdbg_target*>>; 25 26 /** 27 * @class SbeDumpCollector 28 * @brief Manages the collection of dumps from SBEs on failure. 29 * 30 * This class provides functionalities to orchestrate the collection of 31 * diagnostic dumps from Self Boot Engines across multiple processors 32 * in response to failures or for diagnostic purposes. 33 */ 34 class SbeDumpCollector 35 { 36 public: 37 /** 38 * @brief Constructs a new SbeDumpCollector object. 39 */ 40 SbeDumpCollector() = default; 41 42 /** 43 * @brief Destroys the SbeDumpCollector object. 44 */ 45 ~SbeDumpCollector() = default; 46 47 /** 48 * @brief Orchestrates the collection of dumps from all available SBEs. 49 * 50 * Initiates the process of collecting diagnostic dumps from SBEs. This 51 * involves identifying available processors, initiating the dump 52 * collection process, and managing the collected dump files. 53 * 54 * @param type The type of dump to collect. 55 * @param id A unique identifier for the dump collection operation. 56 * @param failingUnit The identifier of the failing unit prompting the dump 57 * collection. 58 * @param path The filesystem path where collected dumps should be stored. 59 */ 60 void collectDump(uint8_t type, uint32_t id, uint64_t failingUnit, 61 const std::filesystem::path& path); 62 63 private: 64 /** 65 * @brief Collects a dump from a single SBE. 66 * 67 * Executes the low-level operations required to collect a diagnostic 68 * dump from the specified SBE. 69 * 70 * @param chip A pointer to the pdbg_target structure representing the SBE. 71 * @param path The filesystem path where the dump should be stored. 72 * @param id The unique identifier for this dump collection operation. 73 * @param type The type of dump to collect. 74 * @param clockState The clock state of the SBE during dump collection. 75 * @param failingUnit The identifier of the failing unit. 76 */ 77 void collectDumpFromSBE(struct pdbg_target* chip, 78 const std::filesystem::path& path, uint32_t id, 79 uint8_t type, uint8_t clockState, 80 uint64_t failingUnit); 81 82 /** 83 * @brief Initializes the PDBG library. 84 * 85 * Prepares the PDBG library for interacting with processor targets. This 86 * must be called before any PDBG-related operations are performed. 87 */ 88 void initializePdbg(); 89 90 /** 91 * @brief Launches asynchronous dump collection tasks for a set of targets. 92 * 93 * This method initiates the dump collection process asynchronously for each 94 * target provided in the `targets` vector. It launches a separate 95 * asynchronous task for each target, where each task calls 96 * `collectDumpFromSBE` with the specified parameters, including the clock 97 * state. 98 * 99 * @param type The type of the dump to collect. This could be a hardware 100 * dump, software dump, etc., as defined by the SBE dump type enumeration. 101 * @param id A unique identifier for the dump collection operation. This ID 102 * is used to tag the collected dump for identification. 103 * @param path The filesystem path where the collected dumps should be 104 * stored. Each dump file will be stored under this directory. 105 * @param failingUnit The identifier of the unit or component that is 106 * failing or suspected to be the cause of the issue prompting the dump 107 * collection. This is used for diagnostic purposes. 108 * @param cstate The clock state during the dump collection. This parameter 109 * dictates whether the dump should be collected with the 110 * clocks running (SBE_CLOCK_ON) or with the clocks stopped (SBE_CLOCK_OFF). 111 * @param targetMap A map of `pdbg_target*` representing the targets from 112 * which dumps should be collected. The key is the proc target with the 113 * list of ocmb targets associated with the proc. 114 * 115 * @return A vector of `std::future<void>` objects. Each future represents 116 * the completion state of an asynchronous dump collection task. The caller 117 * can wait on these futures to determine when all dump collection 118 * tasks have completed. Exceptions thrown by the asynchronous tasks are 119 * captured by the futures and can be rethrown when the futures are 120 * accessed. 121 */ 122 std::vector<std::future<void>> spawnDumpCollectionProcesses( 123 uint8_t type, uint32_t id, const std::filesystem::path& path, 124 uint64_t failingUnit, uint8_t cstate, const TargetMap& targetMap); 125 126 /** @brief This function creates the new dump file in dump file name 127 * format and then writes the contents into it. 128 * @param path - Path to dump file 129 * @param id - A unique id assigned to dump to be collected 130 * @param clockState - Clock state, ON or Off 131 * @param nodeNum - Node containing the chip 132 * @param chipName - Name of the chip 133 * @param chipPos - Chip position of the failing unit 134 * @param dataPtr - Content to write to file 135 * @param len - Length of the content 136 */ 137 void writeDumpFile(const std::filesystem::path& path, const uint32_t id, 138 const uint8_t clockState, const uint8_t nodeNum, 139 const std::string& chipName, const uint8_t chipPos, 140 util::DumpDataPtr& dataPtr, const uint32_t len); 141 142 /** 143 * @brief Determines if fastarray collection is needed based on dump type 144 * and unit. 145 * 146 * @param clockState The current state of the clock. 147 * @param type The type of the dump being collected. 148 * @param failingUnit The ID of the failing unit. 149 * @param chipPos The position of the chip for which the dump is being 150 * collected. 151 * 152 * @return uint8_t - Returns 1 if fastarray collection is needed, 0 153 * otherwise. 154 */ 155 inline uint8_t checkFastarrayCollectionNeeded( 156 const uint8_t clockState, const uint8_t type, uint64_t failingUnit, 157 const uint8_t chipPos) const 158 { 159 using namespace openpower::dump::SBE; 160 161 return (clockState == SBE_CLOCK_OFF && 162 (type == SBE_DUMP_TYPE_HOSTBOOT || 163 (type == SBE_DUMP_TYPE_HARDWARE && chipPos == failingUnit))) 164 ? 1 165 : 0; 166 } 167 168 /** 169 * Logs an error and creates a PEL for SBE chip-op failures. 170 * 171 * @param sbeError - An error object encapsulating details about the SBE 172 * error. 173 * @param chipPos - The position of the chip where the error occurred. 174 * @param sbeType - The type of SBE, used to determine the event log 175 * message. 176 * @param cmdClass - The command class associated with the SBE operation. 177 * @param cmdType - The specific type of command within the command class. 178 * 179 */ 180 bool logErrorAndCreatePEL(const openpower::phal::sbeError_t& sbeError, 181 uint64_t chipPos, SBETypes sbeType, 182 uint32_t cmdClass, uint32_t cmdType); 183 184 /** 185 * Determines the type of SBE for a given chip target. 186 * 187 * @param chip - A pointer to a pdbg_target structure representing the chip. 188 * @return The SBE type for the given chip target. 189 */ 190 inline SBETypes getSBEType([[maybe_unused]] struct pdbg_target* chip) 191 { 192 if (is_ody_ocmb_chip(chip)) 193 { 194 return SBETypes::OCMB; 195 } 196 return SBETypes::PROC; 197 } 198 199 /** 200 * @brief Executes thread stop on a processor target 201 * 202 * If the Self Boot Engine (SBE) is not ready to accept chip operations 203 * (chip-ops), it logs the condition and excludes the processor from the 204 * dump collection process. For critical errors, such as a timeout during 205 * the stop operation, it logs the error and again excludes the processor. 206 * In case of SBE command failure or non-critical errors, it continues with 207 * the dump collection process. 208 * 209 * @param target Pointer to the pdbg target structure representing the 210 * processor to perform the thread stop on. 211 * @return true If the thread stop was successful or in case of non-critical 212 * errors where dump collection can proceed. 213 * @return false If the SBE is not ready for chip-ops or in case of critical 214 * errors like timeouts, indicating the processor should be 215 * excluded from the dump collection. 216 */ 217 bool executeThreadStop(struct pdbg_target* target); 218 }; 219 220 } // namespace openpower::dump::sbe_chipop 221