extern "C"
{
#include <libpdbg.h>
#include <libpdbg_sbe.h>
}

#include "create_pel.hpp"
#include "sbe_consts.hpp"
#include "sbe_dump_collector.hpp"
#include "sbe_type.hpp"

#include <ekb/hwpf/fapi2/include/target_types.H>
#include <libphal.H>
#include <phal_exception.H>

#include <phosphor-logging/elog-errors.hpp>
#include <phosphor-logging/lg2.hpp>
#include <phosphor-logging/log.hpp>
#include <sbe_consts.hpp>
#include <xyz/openbmc_project/Common/File/error.hpp>
#include <xyz/openbmc_project/Common/error.hpp>

#include <cerrno>
#include <cstdint>
#include <filesystem>
#include <format>
#include <fstream>
#include <future>
#include <iomanip>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

namespace openpower::dump::sbe_chipop
{

using namespace phosphor::logging;
using namespace openpower::dump::SBE;
using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;

/**
 * @brief Collect an SBE dump from every eligible processor (and, for hardware
 *        dumps, the eligible OCMB chips below it) into @p path.
 *
 * Walks the "proc" class targets, skipping any that are not enabled or not
 * functional. For hostboot dumps a thread-stop is attempted first and the
 * processor is only kept when executeThreadStop() says so. Collection then
 * runs once per clock state (ON, then OFF); performance dumps are collected
 * with clocks ON only.
 *
 * @param[in] type        Dump type (compared against the SBE_DUMP_TYPE_*
 *                        constants).
 * @param[in] id          Dump id, used in the generated file names.
 * @param[in] failingUnit Failing unit id, forwarded to the per-chip collector.
 * @param[in] path        Directory the dump files are written into.
 *
 * @throws std::runtime_error if @p path is empty once collection has finished.
 */
void SbeDumpCollector::collectDump(uint8_t type, uint32_t id,
                                   uint64_t failingUnit,
                                   const std::filesystem::path& path)
{
    // Purely informational, so it is logged at info level (was error).
    lg2::info("Starting dump collection: type:{TYPE} id:{ID} "
              "failingUnit:{FAILINGUNIT}, path:{PATH}",
              "TYPE", type, "ID", id, "FAILINGUNIT", failingUnit, "PATH",
              path.string());

    initializePdbg();

    TargetMap targets;

    struct pdbg_target* target = nullptr;
    pdbg_for_each_class_target("proc", target)
    {
        if (pdbg_target_probe(target) != PDBG_TARGET_ENABLED ||
            !openpower::phal::pdbg::isTgtFunctional(target))
        {
            continue;
        }

        bool includeTarget = true;
        // if the dump type is hostboot then call stop instructions
        if (type == SBE_DUMP_TYPE_HOSTBOOT)
        {
            includeTarget = executeThreadStop(target);
        }
        if (includeTarget)
        {
            targets[target] = std::vector<struct pdbg_target*>();

            // Hardware dump needs OCMB data if present
            if (type == openpower::dump::SBE::SBE_DUMP_TYPE_HARDWARE)
            {
                struct pdbg_target* ocmbTarget = nullptr;
                pdbg_for_each_target("ocmb", target, ocmbTarget)
                {
                    if (!is_ody_ocmb_chip(ocmbTarget))
                    {
                        continue;
                    }

                    if (pdbg_target_probe(ocmbTarget) != PDBG_TARGET_ENABLED)
                    {
                        continue;
                    }

                    if (!openpower::phal::pdbg::isTgtFunctional(ocmbTarget))
                    {
                        continue;
                    }
                    targets[target].push_back(ocmbTarget);
                }
            }
        }
    }

    std::vector<uint8_t> clockStates = {SBE_CLOCK_ON, SBE_CLOCK_OFF};
    for (auto cstate : clockStates)
    {
        // Skip collection for performance dump if clock state is not ON
        if (type == SBE_DUMP_TYPE_PERFORMANCE && cstate != SBE_CLOCK_ON)
        {
            continue;
        }
        auto futures = spawnDumpCollectionProcesses(type, id, path, failingUnit,
                                                    cstate, targets);

        // Wait for all asynchronous tasks to complete.
        // std::future::wait() does not rethrow a stored exception and the
        // tasks log their own std::exception failures, so this handler is
        // purely defensive.
        for (auto& future : futures)
        {
            try
            {
                future.wait();
            }
            catch (const std::exception& e)
            {
                lg2::error("Failed to collect dump from SBE ErrorMsg({ERROR})",
                           "ERROR", e);
            }
        }
        lg2::info(
            "Dump collection completed for clock state({CSTATE}): type({TYPE}) "
            "id({ID}) failingUnit({FAILINGUNIT}), path({PATH})",
            "CSTATE", cstate, "TYPE", type, "ID", id, "FAILINGUNIT",
            failingUnit, "PATH", path.string());
    }
    // Nothing at all was written: report the whole collection as failed.
    if (std::filesystem::is_empty(path))
    {
        lg2::error("Failed to collect the dump");
        throw std::runtime_error("Failed to collect the dump");
    }
    lg2::info("Dump collection completed");
}

/**
 * @brief Initialise the pdbg layer through the phal wrapper.
 */
void SbeDumpCollector::initializePdbg()
{
    openpower::phal::pdbg::init();
}

/**
 * @brief Launch one asynchronous collection task per processor target.
 *
 * Each task collects from the processor first and then, only when
 * @p cstate is SBE_CLOCK_ON, from that processor's OCMB targets one after
 * another. std::exception failures are logged inside the task and do not
 * stop the remaining collections.
 *
 * @param[in] type        Dump type.
 * @param[in] id          Dump id.
 * @param[in] path        Directory the dump files are written into.
 * @param[in] failingUnit Failing unit id.
 * @param[in] cstate      Clock state to collect with.
 * @param[in] targetMap   Processor target -> OCMB targets to collect from.
 *
 * @return One future per processor entry in @p targetMap.
 */
std::vector<std::future<void>> SbeDumpCollector::spawnDumpCollectionProcesses(
    uint8_t type, uint32_t id, const std::filesystem::path& path,
    uint64_t failingUnit, uint8_t cstate, const TargetMap& targetMap)
{
    std::vector<std::future<void>> futures;

    for (const auto& [procTarget, ocmbTargets] : targetMap)
    {
        // Everything the task needs is captured by value so that it does not
        // reference loop-local or caller-owned objects while running.
        auto future = std::async(std::launch::async,
                                 [this, procTarget, ocmbTargets, path, id, type,
                                  cstate, failingUnit]() {
            try
            {
                this->collectDumpFromSBE(procTarget, path, id, type, cstate,
                                         failingUnit);
            }
            catch (const std::exception& e)
            {
                lg2::error(
                    "Failed to collect dump from SBE on Proc-({PROCINDEX}) {ERROR}",
                    "PROCINDEX", pdbg_target_index(procTarget), "ERROR", e);
            }

            // Collect OCMBs only with clock on
            if (cstate == SBE_CLOCK_ON)
            {
                // Handle OCMBs serially after handling the proc
                for (auto ocmbTarget : ocmbTargets)
                {
                    try
                    {
                        this->collectDumpFromSBE(ocmbTarget, path, id, type,
                                                 cstate, failingUnit);
                    }
                    catch (const std::exception& e)
                    {
                        lg2::error(
                            "Failed to collect dump from OCMB -({OCMBINDEX}) {ERROR}",
                            "OCMBINDEX", pdbg_target_index(ocmbTarget), "ERROR",
                            e);
                    }
                }
            }
        });

        futures.push_back(std::move(future));
    }

    return futures;
}

/**
 * @brief Classify an SBE chip-op error, process its FFDC and, for timeouts,
 *        create a PEL and request an SBE dump.
 *
 * @param[in] sbeError Error raised by the chip-op.
 * @param[in] chipPos  Position of the chip the chip-op was issued to.
 * @param[in] sbeType  Kind of SBE (used to look up names and event strings in
 *                     sbeTypeAttributes).
 * @param[in] cmdClass Chip-op command class, OR-ed with @p cmdType into SRC6.
 * @param[in] cmdType  Chip-op command type.
 *
 * @return false only when the error type is SBE_INTERNAL_FFDC_DATA (FFDC is
 *         present but the chip-op itself did not fail); true in every other
 *         case, including when this function hits an exception of its own.
 */
bool SbeDumpCollector::logErrorAndCreatePEL(
    const openpower::phal::sbeError_t& sbeError, uint64_t chipPos,
    SBETypes sbeType, uint32_t cmdClass, uint32_t cmdType)
{
    std::string chipName;
    std::string event;
    bool dumpIsRequired = false;
    bool isDumpFailure = true;
    try
    {
        // .at() throws std::out_of_range for an unknown SBE type; handled
        // below.
        chipName = sbeTypeAttributes.at(sbeType).chipName;
        event = sbeTypeAttributes.at(sbeType).chipOpFailure;

        lg2::info("log error {CHIP} {POSITION}", "CHIP", chipName, "POSITION",
                  chipPos);

        // Common FFDC data
        openpower::dump::pel::FFDCData pelAdditionalData = {
            {"SRC6", std::format("{:X}{:X}", chipPos, (cmdClass | cmdType))}};

        if (sbeType == SBETypes::OCMB)
        {
            pelAdditionalData.emplace_back(
                "CHIP_TYPE", std::to_string(fapi2::TARGET_TYPE_OCMB_CHIP));
        }

        // Check the error type
        if (sbeError.errType() == openpower::phal::exception::SBE_CMD_TIMEOUT)
        {
            event = sbeTypeAttributes.at(sbeType).chipOpTimeout;
            dumpIsRequired = true;
            // For timeout, we do not expect any FFDC packets
        }
        else if (sbeError.errType() ==
                 openpower::phal::exception::SBE_FFDC_NO_DATA)
        {
            // We will create a PEL without FFDC with the common information we
            // added
            // NOTE(review): dumpIsRequired stays false on this path, so this
            // function does not call createSbeErrorPEL() here - confirm that
            // is intended.
            lg2::error("No FFDC data after a chip-op failure {CHIP} {POSITION}",
                       "CHIP", chipName, "POSITION", chipPos);
            event = sbeTypeAttributes.at(sbeType).noFfdc;
        }
        else
        {
            if (sbeError.errType() ==
                openpower::phal::exception::SBE_INTERNAL_FFDC_DATA)
            {
                lg2::info(
                    "FFDC Not related to chip-op present {CHIP} {POSITION}",
                    "CHIP", chipName, "POSITION", chipPos);
                event = sbeTypeAttributes.at(sbeType).sbeInternalFFDCData;
                // Set before processing the packets so the result stays
                // "not a dump failure" even if processing throws.
                isDumpFailure = false;
            }
            else
            {
                lg2::error("Process FFDC {CHIP} {POSITION}", "CHIP", chipName,
                           "POSITION", chipPos);
            }
            // Processor FFDC Packets
            openpower::dump::pel::processFFDCPackets(sbeError, event,
                                                     pelAdditionalData);
        }

        // If dump is required, request it
        if (dumpIsRequired)
        {
            auto logId = openpower::dump::pel::createSbeErrorPEL(
                event, sbeError, pelAdditionalData);
            util::requestSBEDump(chipPos, logId, sbeType);
        }
    }
    catch (const std::out_of_range& e)
    {
        lg2::error("Unknown SBE Type({SBETYPE}) ErrorMsg({ERROR})", "SBETYPE",
                   sbeType, "ERROR", e);
    }
    catch (const std::exception& e)
    {
        lg2::error("SBE Dump request failed, chip type({CHIPTYPE}) "
                   "position({CHIPPOS}), Error: {ERROR}",
                   "CHIPTYPE", chipName, "CHIPPOS", chipPos, "ERROR", e);
    }

    return isDumpFailure;
}

/**
 * @brief Run the get-dump chip-op on one chip and write the result to a file.
 *
 * @param[in] chip        pdbg target of the chip to collect from.
 * @param[in] path        Directory the dump file is written into.
 * @param[in] id          Dump id.
 * @param[in] type        Dump type.
 * @param[in] clockState  Clock state to collect with.
 * @param[in] failingUnit Failing unit id (feeds the fast-array decision).
 *
 * @throws std::out_of_range if the chip's SBE type has no entry in
 *         sbeTypeAttributes.
 */
void SbeDumpCollector::collectDumpFromSBE(struct pdbg_target* chip,
                                          const std::filesystem::path& path,
                                          uint32_t id, uint8_t type,
                                          uint8_t clockState,
                                          uint64_t failingUnit)
{
    auto chipPos = pdbg_target_index(chip);
    SBETypes sbeType = getSBEType(chip);
    auto chipName = sbeTypeAttributes.at(sbeType).chipName;
    lg2::info(
        "Collecting dump from ({CHIPTYPE}) ({POSITION}): path({PATH}) id({ID}) "
        "type({TYPE}) clockState({CLOCKSTATE}) failingUnit({FAILINGUNIT})",
        "CHIPTYPE", chipName, "POSITION", chipPos, "PATH", path.string(), "ID",
        id, "TYPE", type, "CLOCKSTATE", clockState, "FAILINGUNIT", failingUnit);

    util::DumpDataPtr dataPtr;
    uint32_t len = 0;
    uint8_t collectFastArray =
        checkFastarrayCollectionNeeded(clockState, type, failingUnit, chipPos);

    try
    {
        openpower::phal::sbe::getDump(chip, type, clockState, collectFastArray,
                                      dataPtr.getPtr(), &len);
    }
    catch (const openpower::phal::sbeError_t& sbeError)
    {
        if (sbeError.errType() ==
            openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
        {
            // SBE is not ready to accept chip-ops,
            // Skip the request, no additional error handling required.
            lg2::info("Collect dump: Skipping ({ERROR}) dump({TYPE}) "
                      "on proc({PROC}) clock state({CLOCKSTATE})",
                      "ERROR", sbeError, "TYPE", type, "PROC", chipPos,
                      "CLOCKSTATE", clockState);
            return;
        }

        // If the FFDC is from actual chip-op failure this function will
        // return true, if the chip-op is not failed but FFDC is present
        // then create PELs with FFDC but write the dump contents to the
        // file.
        if (logErrorAndCreatePEL(sbeError, chipPos, sbeType,
                                 SBEFIFO_CMD_CLASS_DUMP, SBEFIFO_CMD_GET_DUMP))
        {
            lg2::error("Error in collecting dump dump type({TYPE}), "
                       "clockstate({CLOCKSTATE}), chip type({CHIPTYPE}) "
                       "position({POSITION}), "
                       "collectFastArray({COLLECTFASTARRAY}) error({ERROR})",
                       "TYPE", type, "CLOCKSTATE", clockState, "CHIPTYPE",
                       chipName, "POSITION", chipPos, "COLLECTFASTARRAY",
                       collectFastArray, "ERROR", sbeError);
            return;
        }
    }
    // Node number is passed as 0 here.
    writeDumpFile(path, id, clockState, 0, chipName, chipPos, dataPtr, len);
}

/**
 * @brief Write one chip's dump data to a file below @p path.
 *
 * The file name is
 * "<id, zero padded to 8>.SbeDataClocks<On|Off>.node<nodeNum>.<chipName><chipPos>".
 * Open and write failures are logged and reported but not propagated, so
 * that dumps collected from other SBEs can still be packaged.
 *
 * @param[in] path       Directory to create the file in.
 * @param[in] id         Dump id.
 * @param[in] clockState Clock state the data was collected with.
 * @param[in] nodeNum    Node number used in the file name.
 * @param[in] chipName   Chip name used in the file name.
 * @param[in] chipPos    Chip position used in the file name.
 * @param[in] dataPtr    Holder of the dump data.
 * @param[in] len        Number of bytes to write.
 */
void SbeDumpCollector::writeDumpFile(
    const std::filesystem::path& path, const uint32_t id,
    const uint8_t clockState, const uint8_t nodeNum,
    const std::string& chipName, const uint8_t chipPos,
    util::DumpDataPtr& dataPtr, const uint32_t len)
{
    using namespace sdbusplus::xyz::openbmc_project::Common::Error;

    // Construct the filename
    std::ostringstream filenameBuilder;
    filenameBuilder << std::setw(8) << std::setfill('0') << id
                    << ".SbeDataClocks"
                    << (clockState == SBE_CLOCK_ON ? "On" : "Off") << ".node"
                    << static_cast<int>(nodeNum) << "." << chipName
                    << static_cast<int>(chipPos);

    auto dumpPath = path / filenameBuilder.str();

    // Attempt to open the file
    std::ofstream outfile(dumpPath, std::ios::out | std::ios::binary);
    if (!outfile)
    {
        // Capture errno right away, before anything else can overwrite it.
        auto err = errno;

        using namespace sdbusplus::xyz::openbmc_project::Common::File::Error;
        using metadata = xyz::openbmc_project::Common::File::Open;
        // Unable to open the file for writing
        lg2::error("Error opening file to write dump, "
                   "errno({ERRNO}), filepath({FILEPATH})",
                   "ERRNO", err, "FILEPATH", dumpPath.string());

        report<Open>(metadata::ERRNO(err), metadata::PATH(dumpPath.c_str()));
        // Just return here, so that the dumps collected from other
        // SBEs can be packaged.
        return;
    }

    // Write to the file
    try
    {
        // Streams do not throw by default; without this the catch below can
        // never run and a failed write would be logged as a success.
        outfile.exceptions(std::ofstream::failbit | std::ofstream::badbit);

        outfile.write(reinterpret_cast<const char*>(dataPtr.getData()), len);
        // Push buffered data out now so that a late failure is detected
        // before success is logged.
        outfile.flush();

        lg2::info("Successfully wrote dump file "
                  "path=({PATH}) size=({SIZE})",
                  "PATH", dumpPath.string(), "SIZE", len);
    }
    catch (const std::ofstream::failure& oe)
    {
        using namespace sdbusplus::xyz::openbmc_project::Common::File::Error;
        using metadata = xyz::openbmc_project::Common::File::Write;

        lg2::error(
            "Failed to write to dump file, "
            "errorMsg({ERROR}), error({ERRORCODE}), filepath({FILEPATH})",
            "ERROR", oe, "ERRORCODE", oe.code().value(), "FILEPATH",
            dumpPath.string());
        report<Write>(metadata::ERRNO(oe.code().value()),
                      metadata::PATH(dumpPath.c_str()));
        // Just return here so dumps collected from other SBEs can be
        // packaged.
    }
}

/**
 * @brief Issue the thread-stop chip-op on a processor ahead of a hostboot
 *        dump.
 *
 * @param[in] target pdbg target of the processor.
 *
 * @return true if the processor should be included in dump collection
 *         (stop succeeded, or failed with an error other than the two
 *         below); false when the SBE does not allow chip-ops
 *         (SBE_CHIPOP_NOT_ALLOWED) or the chip-op timed out
 *         (SBE_CMD_TIMEOUT).
 */
bool SbeDumpCollector::executeThreadStop(struct pdbg_target* target)
{
    try
    {
        openpower::phal::sbe::threadStopProc(target);
        return true;
    }
    catch (const openpower::phal::sbeError_t& sbeError)
    {
        uint64_t chipPos = pdbg_target_index(target);
        if (sbeError.errType() ==
            openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
        {
            lg2::info("SBE is not ready to accept chip-op: Skipping "
                      "stop instruction on proc-({POSITION}) error({ERROR}) ",
                      "POSITION", chipPos, "ERROR", sbeError);
            return false; // Do not include the target for dump collection
        }

        lg2::error("Stop instructions failed on "
                   "proc-({POSITION}) error({ERROR}) ",
                   "POSITION", chipPos, "ERROR", sbeError);

        // The return value is deliberately ignored here; inclusion of the
        // target is decided by the error type below.
        logErrorAndCreatePEL(sbeError, chipPos, SBETypes::PROC,
                             SBEFIFO_CMD_CLASS_INSTRUCTION,
                             SBEFIFO_CMD_CONTROL_INSN);
        // For TIMEOUT, log the error and skip adding the processor for dump
        // collection
        if (sbeError.errType() == openpower::phal::exception::SBE_CMD_TIMEOUT)
        {
            return false;
        }
    }
    // Include the target for dump collection for SBE_CMD_FAILED or any other
    // non-critical errors
    return true;
}

} // namespace openpower::dump::sbe_chipop