extern "C"
{
#include <libpdbg.h>
#include <libpdbg_sbe.h>
}

#include "create_pel.hpp"
#include "sbe_consts.hpp"
#include "sbe_dump_collector.hpp"
#include "sbe_type.hpp"

#include <ekb/hwpf/fapi2/include/target_types.H>
#include <libphal.H>
#include <phal_exception.H>

#include <phosphor-logging/elog-errors.hpp>
#include <phosphor-logging/lg2.hpp>
#include <phosphor-logging/log.hpp>
#include <sbe_consts.hpp>
#include <xyz/openbmc_project/Common/File/error.hpp>
#include <xyz/openbmc_project/Common/error.hpp>

#include <cerrno>
#include <cstdint>
#include <filesystem>
#include <format>
#include <fstream>
#include <future>
#include <mutex>
#include <stdexcept>
#include <string>
#include <vector>

namespace openpower::dump::sbe_chipop
{

using namespace phosphor::logging;
using namespace openpower::dump::SBE;
using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;

/**
 * @brief Collect an SBE dump of the given type into a directory.
 *
 * Builds the list of processor targets that are enabled and functional,
 * then runs the collection once per clock state (ON first, then OFF) and
 * waits for every per-processor task before moving to the next state.
 *
 * @param[in] type        - Dump type (compared against the SBE_DUMP_TYPE_*
 *                          constants)
 * @param[in] id          - Dump id, used in the output file names
 * @param[in] failingUnit - Failing unit, forwarded to the collection step
 * @param[in] path        - Directory the dump files are written to
 *
 * @throws std::runtime_error if @p path is empty once collection finished
 */
void SbeDumpCollector::collectDump(uint8_t type, uint32_t id,
                                   uint64_t failingUnit,
                                   const std::filesystem::path& path)
{
    // Informational message: logged at the same level as the matching
    // "completed" messages below.
    lg2::info("Starting dump collection: type:{TYPE} id:{ID} "
              "failingUnit:{FAILINGUNIT}, path:{PATH}",
              "TYPE", type, "ID", id, "FAILINGUNIT", failingUnit, "PATH",
              path.string());

    initializePdbg();

    TargetMap targets;

    struct pdbg_target* target = nullptr;
    pdbg_for_each_class_target("proc", target)
    {
        if (pdbg_target_probe(target) != PDBG_TARGET_ENABLED ||
            !openpower::phal::pdbg::isTgtFunctional(target))
        {
            continue;
        }

        bool includeTarget = true;
        // if the dump type is hostboot then call stop instructions
        if (type == SBE_DUMP_TYPE_HOSTBOOT)
        {
            includeTarget = executeThreadStop(target, path);
        }
        if (includeTarget)
        {
            targets[target] = std::vector<struct pdbg_target*>();

            // Hardware dump needs OCMB data if present
            if (type == openpower::dump::SBE::SBE_DUMP_TYPE_HARDWARE)
            {
                struct pdbg_target* ocmbTarget = nullptr;
                pdbg_for_each_target("ocmb", target, ocmbTarget)
                {
                    if (!is_ody_ocmb_chip(ocmbTarget))
                    {
                        continue;
                    }

                    if (pdbg_target_probe(ocmbTarget) != PDBG_TARGET_ENABLED)
                    {
                        continue;
                    }

                    if (!openpower::phal::pdbg::isTgtFunctional(ocmbTarget))
                    {
                        continue;
                    }
                    targets[target].push_back(ocmbTarget);
                }
            }
        }
    }

    std::vector<uint8_t> clockStates = {SBE_CLOCK_ON, SBE_CLOCK_OFF};
    for (auto cstate : clockStates)
    {
        // Skip collection for performance dump if clock state is not ON
        if (type == SBE_DUMP_TYPE_PERFORMANCE && cstate != SBE_CLOCK_ON)
        {
            continue;
        }
        auto futures = spawnDumpCollectionProcesses(type, id, path, failingUnit,
                                                    cstate, targets);

        // Wait for all asynchronous tasks to complete
        for (auto& future : futures)
        {
            try
            {
                future.wait();
            }
            catch (const std::exception& e)
            {
                lg2::error("Failed to collect dump from SBE ErrorMsg({ERROR})",
                           "ERROR", e);
            }
        }
        lg2::info(
            "Dump collection completed for clock state({CSTATE}): type({TYPE}) "
            "id({ID}) failingUnit({FAILINGUNIT}), path({PATH})",
            "CSTATE", cstate, "TYPE", type, "ID", id, "FAILINGUNIT",
            failingUnit, "PATH", path.string());
    }
    // Nothing was written by any of the tasks: report the collection as
    // failed to the caller.
    if (std::filesystem::is_empty(path))
    {
        lg2::error("Failed to collect the dump");
        throw std::runtime_error("Failed to collect the dump");
    }
    lg2::info("Dump collection completed");
}

/**
 * @brief Initialize the pdbg library through the libphal wrapper.
 */
void SbeDumpCollector::initializePdbg()
{
    openpower::phal::pdbg::init();
}

/**
 * @brief Start one asynchronous collection task per processor target.
 *
 * Each task collects from the processor first and then, only when the
 * clock state is SBE_CLOCK_ON, from that processor's OCMB targets one
 * after the other. Exceptions derived from std::exception are logged
 * inside the task and do not abort the remaining targets.
 *
 * @param[in] type        - Dump type
 * @param[in] id          - Dump id
 * @param[in] path        - Directory the dump files are written to
 * @param[in] failingUnit - Failing unit
 * @param[in] cstate      - Clock state for this pass
 * @param[in] targetMap   - Processor targets mapped to their OCMB targets
 *
 * @return One future per processor target in @p targetMap
 */
std::vector<std::future<void>> SbeDumpCollector::spawnDumpCollectionProcesses(
    uint8_t type, uint32_t id, const std::filesystem::path& path,
    uint64_t failingUnit, uint8_t cstate, const TargetMap& targetMap)
{
    std::vector<std::future<void>> futures;

    for (const auto& [procTarget, ocmbTargets] : targetMap)
    {
        // Everything is captured by value: the task may still be running
        // after this function has returned.
        auto future = std::async(std::launch::async, [this, procTarget,
                                                      ocmbTargets, path, id,
                                                      type, cstate,
                                                      failingUnit]() {
            try
            {
                this->collectDumpFromSBE(procTarget, path, id, type, cstate,
                                         failingUnit);
            }
            catch (const std::exception& e)
            {
                lg2::error(
                    "Failed to collect dump from SBE on Proc-({PROCINDEX}) {ERROR}",
                    "PROCINDEX", pdbg_target_index(procTarget), "ERROR", e);
            }

            // Collect OCMBs only with clock on
            if (cstate == SBE_CLOCK_ON)
            {
                // Handle OCMBs serially after handling the proc
                for (auto ocmbTarget : ocmbTargets)
                {
                    try
                    {
                        this->collectDumpFromSBE(ocmbTarget, path, id, type,
                                                 cstate, failingUnit);
                    }
                    catch (const std::exception& e)
                    {
                        lg2::error(
                            "Failed to collect dump from OCMB -({OCMBINDEX}) {ERROR}",
                            "OCMBINDEX", pdbg_target_index(ocmbTarget), "ERROR",
                            e);
                    }
                }
            }
        });

        futures.push_back(std::move(future));
    }

    return futures;
}

/**
 * @brief Log an SBE chip-op error, create PEL(s) and record them in the dump.
 *
 * Picks the event name from the error type, processes the FFDC packets
 * where applicable, appends the resulting log information to the
 * "errorInfo" file in the parent directory of @p path and, for a chip-op
 * timeout, creates an SBE error PEL and requests an SBE dump.
 * All std::exception errors are logged here and not propagated.
 *
 * @param[in] sbeError - Error reported by the failed chip-op
 * @param[in] chipPos  - Position of the chip
 * @param[in] sbeType  - SBE type, used to look up name and event strings
 * @param[in] cmdClass - Command class of the failed chip-op
 * @param[in] cmdType  - Command type of the failed chip-op
 * @param[in] path     - Dump directory; its parent receives "errorInfo"
 *
 * @return false when the error type is SBE_INTERNAL_FFDC_DATA (FFDC present
 *         but not related to the chip-op) and no exception interrupted the
 *         handling before that was determined; true otherwise.
 */
bool SbeDumpCollector::logErrorAndCreatePEL(
    const openpower::phal::sbeError_t& sbeError, uint64_t chipPos,
    SBETypes sbeType, uint32_t cmdClass, uint32_t cmdType,
    const std::filesystem::path& path)
{
    namespace fs = std::filesystem;

    std::string chipName;
    std::string event;
    bool dumpIsRequired = false;
    bool isDumpFailure = true;
    try
    {
        chipName = sbeTypeAttributes.at(sbeType).chipName;
        event = sbeTypeAttributes.at(sbeType).chipOpFailure;

        lg2::info("log error {CHIP} {POSITION}", "CHIP", chipName, "POSITION",
                  chipPos);

        // Common FFDC data
        openpower::dump::pel::FFDCData pelAdditionalData = {
            {"SRC6", std::format("0x{:X}{:X}", chipPos, (cmdClass | cmdType))}};

        if (sbeType == SBETypes::OCMB)
        {
            pelAdditionalData.emplace_back(
                "CHIP_TYPE", std::to_string(fapi2::TARGET_TYPE_OCMB_CHIP));
        }

        // Check the error type
        if (sbeError.errType() == openpower::phal::exception::SBE_CMD_TIMEOUT)
        {
            event = sbeTypeAttributes.at(sbeType).chipOpTimeout;
            dumpIsRequired = true;
            // For timeout, we do not expect any FFDC packets
        }
        else if (sbeError.errType() ==
                 openpower::phal::exception::SBE_FFDC_NO_DATA)
        {
            // We will create a PEL without FFDC with the common information we
            // added
            // NOTE(review): no PEL-creating call is reached on this path in
            // this function (dumpIsRequired stays false) - confirm whether a
            // PEL is expected to be created here.
            lg2::error("No FFDC data after a chip-op failure {CHIP} {POSITION}",
                       "CHIP", chipName, "POSITION", chipPos);
            event = sbeTypeAttributes.at(sbeType).noFfdc;
        }
        else
        {
            if (sbeError.errType() ==
                openpower::phal::exception::SBE_INTERNAL_FFDC_DATA)
            {
                lg2::info(
                    "FFDC Not related to chip-op present {CHIP} {POSITION}",
                    "CHIP", chipName, "POSITION", chipPos);
                event = sbeTypeAttributes.at(sbeType).sbeInternalFFDCData;
                isDumpFailure = false;
            }
            else
            {
                lg2::error("Process FFDC {CHIP} {POSITION}", "CHIP", chipName,
                           "POSITION", chipPos);
            }
            // Processor FFDC Packets
            std::vector<uint32_t> logIdList =
                openpower::dump::pel::processFFDCPackets(sbeError, event,
                                                         pelAdditionalData);
            for (auto logId : logIdList)
            {
                // A failure on one log id must not stop the remaining ones
                try
                {
                    auto logInfo = openpower::dump::pel::getLogInfo(logId);
                    addLogDataToDump(std::get<0>(logInfo), std::get<1>(logInfo),
                                     chipName, chipPos, path.parent_path());
                }
                catch (const std::exception& e)
                {
                    lg2::error("Failed to get error Info: {ERROR} ", "ERROR",
                               e);
                }
            }
        }

        // If dump is required, request it
        if (dumpIsRequired)
        {
            auto logId = openpower::dump::pel::createSbeErrorPEL(
                event, sbeError, pelAdditionalData);
            try
            {
                auto logInfo = openpower::dump::pel::getLogInfo(logId);
                addLogDataToDump(std::get<0>(logInfo), std::get<1>(logInfo),
                                 chipName, chipPos, path.parent_path());
                util::requestSBEDump(chipPos, std::get<0>(logInfo), sbeType);
            }
            catch (const std::exception& e)
            {
                lg2::error(
                    "Failed to get error Info, failed to create sbe dump: {ERROR}",
                    "ERROR", e);
            }
        }
    }
    catch (const std::out_of_range& e)
    {
        lg2::error("Unknown SBE Type({SBETYPE}) ErrorMsg({ERROR})", "SBETYPE",
                   sbeType, "ERROR", e);
    }
    catch (const std::exception& e)
    {
        lg2::error("SBE Dump request failed, chip type({CHIPTYPE}) "
                   "position({CHIPPOS}), Error: {ERROR}",
                   "CHIPTYPE", chipName, "CHIPPOS", chipPos, "ERROR", e);
    }

    return isDumpFailure;
}

/**
 * @brief Collect the dump from one chip and write it to a file.
 *
 * Issues the getDump chip-op. When the chip-op is not allowed the request
 * is skipped silently. For any other SBE error the error is logged via
 * logErrorAndCreatePEL(); the data is written only if that call reports
 * that the error was not an actual dump failure.
 *
 * @param[in] chip        - pdbg target of the chip to collect from
 * @param[in] path        - Directory the dump file is written to
 * @param[in] id          - Dump id
 * @param[in] type        - Dump type
 * @param[in] clockState  - Clock state for this collection
 * @param[in] failingUnit - Failing unit
 *
 * @throws std::out_of_range if the SBE type of @p chip has no entry in
 *         sbeTypeAttributes
 */
void SbeDumpCollector::collectDumpFromSBE(
    struct pdbg_target* chip, const std::filesystem::path& path, uint32_t id,
    uint8_t type, uint8_t clockState, uint64_t failingUnit)
{
    auto chipPos = pdbg_target_index(chip);
    SBETypes sbeType = getSBEType(chip);
    auto chipName = sbeTypeAttributes.at(sbeType).chipName;
    lg2::info(
        "Collecting dump from ({CHIPTYPE}) ({POSITION}): path({PATH}) id({ID}) "
        "type({TYPE}) clockState({CLOCKSTATE}) failingUnit({FAILINGUNIT})",
        "CHIPTYPE", chipName, "POSITION", chipPos, "PATH", path.string(), "ID",
        id, "TYPE", type, "CLOCKSTATE", clockState, "FAILINGUNIT", failingUnit);

    util::DumpDataPtr dataPtr;
    uint32_t len = 0;
    uint8_t collectFastArray =
        checkFastarrayCollectionNeeded(clockState, type, failingUnit, chipPos);

    try
    {
        openpower::phal::sbe::getDump(chip, type, clockState, collectFastArray,
                                      dataPtr.getPtr(), &len);
    }
    catch (const openpower::phal::sbeError_t& sbeError)
    {
        if (sbeError.errType() ==
            openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
        {
            // SBE is not ready to accept chip-ops,
            // Skip the request, no additional error handling required.
            lg2::info("Collect dump: Skipping ({ERROR}) dump({TYPE}) "
                      "on proc({PROC}) clock state({CLOCKSTATE})",
                      "ERROR", sbeError, "TYPE", type, "PROC", chipPos,
                      "CLOCKSTATE", clockState);
            return;
        }

        // If the FFDC is from actual chip-op failure this function will
        // return true, if the chip-op is not failed but FFDC is present
        // then create PELs with FFDC but write the dump contents to the
        // file.
        if (logErrorAndCreatePEL(sbeError, chipPos, sbeType,
                                 SBEFIFO_CMD_CLASS_DUMP, SBEFIFO_CMD_GET_DUMP,
                                 path))
        {
            lg2::error("Error in collecting dump dump type({TYPE}), "
                       "clockstate({CLOCKSTATE}), chip type({CHIPTYPE}) "
                       "position({POSITION}), "
                       "collectFastArray({COLLECTFASTARRAY}) error({ERROR})",
                       "TYPE", type, "CLOCKSTATE", clockState, "CHIPTYPE",
                       chipName, "POSITION", chipPos, "COLLECTFASTARRAY",
                       collectFastArray, "ERROR", sbeError);
            return;
        }
    }
    writeDumpFile(path, id, clockState, 0, chipName, chipPos, dataPtr, len);
}

/**
 * @brief Write the collected dump data to a file under @p path.
 *
 * The file name is
 * "<id as 8 hex digits>.SbeDataClocks<On|Off>.node<nodeNum>.<chipName><chipPos>".
 * Open and write failures are logged and reported as error logs, but are
 * not propagated, so that the dumps collected from other SBEs can still be
 * packaged.
 *
 * @param[in] path       - Directory the file is created in
 * @param[in] id         - Dump id
 * @param[in] clockState - Clock state; SBE_CLOCK_ON selects "On", any other
 *                         value "Off"
 * @param[in] nodeNum    - Node number
 * @param[in] chipName   - Chip name
 * @param[in] chipPos    - Chip position
 * @param[in] dataPtr    - Holder of the dump data
 * @param[in] len        - Number of bytes to write
 */
void SbeDumpCollector::writeDumpFile(
    const std::filesystem::path& path, const uint32_t id,
    const uint8_t clockState, const uint8_t nodeNum,
    const std::string& chipName, const uint8_t chipPos,
    util::DumpDataPtr& dataPtr, const uint32_t len)
{
    namespace fileError = sdbusplus::xyz::openbmc_project::Common::File::Error;

    // Construct the filename
    const auto filename = std::format(
        "{:08x}.SbeDataClocks{}.node{}.{}{}", id,
        (clockState == SBE_CLOCK_ON ? "On" : "Off"), static_cast<int>(nodeNum),
        chipName, static_cast<int>(chipPos));

    auto dumpPath = path / filename;

    // Attempt to open the file
    std::ofstream outfile(dumpPath, std::ios::out | std::ios::binary);
    if (!outfile)
    {
        // Capture errno first, before any other call can overwrite it
        auto err = errno;
        using metadata = xyz::openbmc_project::Common::File::Open;
        // Unable to open the file for writing
        lg2::error("Error opening file to write dump, "
                   "errno({ERRNO}), filepath({FILEPATH})",
                   "ERRNO", err, "FILEPATH", dumpPath.string());

        report<fileError::Open>(metadata::ERRNO(err),
                                metadata::PATH(dumpPath.c_str()));
        // Just return here, so that the dumps collected from other
        // SBEs can be packaged.
        return;
    }

    // Write to the file
    try
    {
        // Streams do not throw unless asked to; without this the handler
        // below is never reached and a failed write is reported as success.
        outfile.exceptions(std::ofstream::failbit | std::ofstream::badbit);

        outfile.write(reinterpret_cast<const char*>(dataPtr.getData()), len);
        // Push the buffered data out while errors can still be reported;
        // the destructor closes the file without throwing.
        outfile.flush();

        lg2::info("Successfully wrote dump file "
                  "path=({PATH}) size=({SIZE})",
                  "PATH", dumpPath.string(), "SIZE", len);
    }
    catch (const std::ofstream::failure& oe)
    {
        using metadata = xyz::openbmc_project::Common::File::Write;

        lg2::error(
            "Failed to write to dump file, "
            "errorMsg({ERROR}), error({ERRORCODE}), filepath({FILEPATH})",
            "ERROR", oe, "ERRORCODE", oe.code().value(), "FILEPATH",
            dumpPath.string());
        report<fileError::Write>(metadata::ERRNO(oe.code().value()),
                                 metadata::PATH(dumpPath.c_str()));
        // Just return here so dumps collected from other SBEs can be
        // packaged.
    }
}

/**
 * @brief Stop the threads on a processor before a hostboot dump.
 *
 * @param[in] target - pdbg target of the processor
 * @param[in] path   - Dump directory, forwarded to logErrorAndCreatePEL()
 *
 * @return true if the processor should be included in the dump collection
 *         (stop succeeded, or failed with an error other than the two
 *         below); false if the chip-op was not allowed or timed out.
 */
bool SbeDumpCollector::executeThreadStop(struct pdbg_target* target,
                                         const std::filesystem::path& path)
{
    try
    {
        openpower::phal::sbe::threadStopProc(target);
        return true;
    }
    catch (const openpower::phal::sbeError_t& sbeError)
    {
        uint64_t chipPos = pdbg_target_index(target);
        if (sbeError.errType() ==
            openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
        {
            lg2::info("SBE is not ready to accept chip-op: Skipping "
                      "stop instruction on proc-({POSITION}) error({ERROR}) ",
                      "POSITION", chipPos, "ERROR", sbeError);
            return false; // Do not include the target for dump collection
        }

        lg2::error("Stop instructions failed on "
                   "proc-({POSITION}) error({ERROR}) ",
                   "POSITION", chipPos, "ERROR", sbeError);

        logErrorAndCreatePEL(sbeError, chipPos, SBETypes::PROC,
                             SBEFIFO_CMD_CLASS_INSTRUCTION,
                             SBEFIFO_CMD_CONTROL_INSN, path);
        // For TIMEOUT, log the error and skip adding the processor for dump
        // collection
        if (sbeError.errType() == openpower::phal::exception::SBE_CMD_TIMEOUT)
        {
            return false;
        }
    }
    // Include the target for dump collection for SBE_CMD_FAILED or any other
    // non-critical errors
    return true;
}

/**
 * @brief Append one error-log entry to the "errorInfo" file under @p path.
 *
 * Writes the "ErrorInfo:" header when the file did not exist yet, followed
 * by the PEL id (8 hex digits), the SRC and the resource (chip name and
 * position). If the file cannot be opened the failure is logged and
 * nothing is written.
 *
 * @param[in] pelId    - PEL id of the error log
 * @param[in] src      - SRC of the error log
 * @param[in] chipName - Chip name
 * @param[in] chipPos  - Chip position
 * @param[in] path     - Directory containing the "errorInfo" file
 */
void SbeDumpCollector::addLogDataToDump(uint32_t pelId, std::string src,
                                        std::string chipName, uint64_t chipPos,
                                        const std::filesystem::path& path)
{
    // This function is reached from the per-processor tasks started with
    // std::async, which all append to the same file. Serialize the
    // exists-check and the writes so that the header is written once and
    // entries do not interleave. Declared before the stream so the stream
    // is flushed and closed while the lock is still held.
    static std::mutex errorInfoMutex;
    const std::scoped_lock lock(errorInfoMutex);

    std::filesystem::path info = path / "errorInfo";
    auto fileExists = std::filesystem::exists(info);
    std::ofstream fout;
    fout.open(info, std::ios::app);
    if (!fout)
    {
        lg2::error("Error: Failed to open the file! {FILE}", "FILE", info);
        lg2::error("No error Info is added to dump file");
        return;
    }
    if (!fileExists)
    {
        fout << "ErrorInfo:" << '\n';
    }
    auto pel = " " + std::format("{:08x}", pelId) + ":";
    fout << pel << '\n';
    fout << " src: " << src << '\n';
    auto resource = chipName + " " + std::to_string(chipPos);
    fout << " Resource: " << resource << '\n';
}

} // namespace openpower::dump::sbe_chipop