extern "C"
{
#include <libpdbg.h>
#include <libpdbg_sbe.h>
}

#include "create_pel.hpp"
#include "sbe_consts.hpp"
#include "sbe_dump_collector.hpp"
#include "sbe_type.hpp"

#include <ekb/hwpf/fapi2/include/target_types.H>
#include <libphal.H>
#include <phal_exception.H>

#include <phosphor-logging/elog-errors.hpp>
#include <phosphor-logging/lg2.hpp>
#include <phosphor-logging/log.hpp>
#include <sbe_consts.hpp>
#include <xyz/openbmc_project/Common/File/error.hpp>
#include <xyz/openbmc_project/Common/error.hpp>

#include <cerrno>
#include <cstdint>
#include <filesystem>
#include <format>
#include <fstream>
#include <future>
#include <iomanip>
#include <ios>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

namespace openpower::dump::sbe_chipop
{

using namespace phosphor::logging;
using namespace openpower::dump::SBE;
using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;

/**
 * @brief Collect an SBE dump from every usable processor (and, for hardware
 *        dumps, from the Odyssey OCMBs below each processor).
 *
 * The function builds a processor -> OCMB list map, then runs one collection
 * pass per clock state (ON, then OFF). Performance dumps are only collected
 * with clocks ON.
 *
 * @param[in] type        - dump type passed through to the SBE chip-op
 * @param[in] id          - dump id, used in log messages and file names
 * @param[in] failingUnit - failing unit passed through to the collectors
 * @param[in] path        - directory the dump files are written into
 *
 * @throws std::runtime_error if @p path is empty after all passes completed
 */
void SbeDumpCollector::collectDump(uint8_t type, uint32_t id,
                                   uint64_t failingUnit,
                                   const std::filesystem::path& path)
{
    // Normal progress trace, so it is logged at info level.
    lg2::info("Starting dump collection: type:{TYPE} id:{ID} "
              "failingUnit:{FAILINGUNIT}, path:{PATH}",
              "TYPE", type, "ID", id, "FAILINGUNIT", failingUnit, "PATH",
              path.string());

    initializePdbg();

    TargetMap targets;

    struct pdbg_target* target = nullptr;
    pdbg_for_each_class_target("proc", target)
    {
        // Only enabled and functional processors take part in the dump
        if (pdbg_target_probe(target) != PDBG_TARGET_ENABLED ||
            !openpower::phal::pdbg::isTgtFunctional(target))
        {
            continue;
        }

        bool includeTarget = true;
        // if the dump type is hostboot then call stop instructions
        if (type == SBE_DUMP_TYPE_HOSTBOOT)
        {
            includeTarget = executeThreadStop(target);
        }
        if (includeTarget)
        {
            targets[target] = std::vector<struct pdbg_target*>();

            // Hardware dump needs OCMB data if present
            if (type == openpower::dump::SBE::SBE_DUMP_TYPE_HARDWARE)
            {
                struct pdbg_target* ocmbTarget = nullptr;
                pdbg_for_each_target("ocmb", target, ocmbTarget)
                {
                    if (!is_ody_ocmb_chip(ocmbTarget))
                    {
                        continue;
                    }

                    if (pdbg_target_probe(ocmbTarget) != PDBG_TARGET_ENABLED)
                    {
                        continue;
                    }

                    if (!openpower::phal::pdbg::isTgtFunctional(ocmbTarget))
                    {
                        continue;
                    }
                    targets[target].push_back(ocmbTarget);
                }
            }
        }
    }

    std::vector<uint8_t> clockStates = {SBE_CLOCK_ON, SBE_CLOCK_OFF};
    for (auto cstate : clockStates)
    {
        // Skip collection for performance dump if clock state is not ON
        if (type == SBE_DUMP_TYPE_PERFORMANCE && cstate != SBE_CLOCK_ON)
        {
            continue;
        }
        auto futures = spawnDumpCollectionProcesses(type, id, path, failingUnit,
                                                    cstate, targets);

        // Wait for all asynchronous tasks to complete
        for (auto& future : futures)
        {
            try
            {
                // get() instead of wait(): wait() never rethrows, so an
                // exception stored in the future would be dropped silently
                // and the handler below could never run.
                future.get();
            }
            catch (const std::exception& e)
            {
                lg2::error("Failed to collect dump from SBE ErrorMsg({ERROR})",
                           "ERROR", e);
            }
        }
        lg2::info(
            "Dump collection completed for clock state({CSTATE}): type({TYPE}) "
            "id({ID}) failingUnit({FAILINGUNIT}), path({PATH})",
            "CSTATE", cstate, "TYPE", type, "ID", id, "FAILINGUNIT",
            failingUnit, "PATH", path.string());
    }
    // Nothing was written by any collector: report the whole dump as failed
    if (std::filesystem::is_empty(path))
    {
        lg2::error("Failed to collect the dump");
        throw std::runtime_error("Failed to collect the dump");
    }
    lg2::info("Dump collection completed");
}

/**
 * @brief Initialize the pdbg library through the libphal wrapper.
 */
void SbeDumpCollector::initializePdbg()
{
    openpower::phal::pdbg::init();
}

/**
 * @brief Start one asynchronous collection task per processor.
 *
 * Each task collects from the processor first and then, only when the clock
 * state is ON, from that processor's OCMBs one after another. Failures are
 * logged inside the task so one failing chip does not stop the others.
 *
 * @param[in] type        - dump type
 * @param[in] id          - dump id
 * @param[in] path        - directory the dump files are written into
 * @param[in] failingUnit - failing unit passed through to the collector
 * @param[in] cstate      - clock state for this pass
 * @param[in] targetMap   - processor target -> list of OCMB targets
 *
 * @return one future per entry of @p targetMap
 */
std::vector<std::future<void>> SbeDumpCollector::spawnDumpCollectionProcesses(
    uint8_t type, uint32_t id, const std::filesystem::path& path,
    uint64_t failingUnit, uint8_t cstate, const TargetMap& targetMap)
{
    std::vector<std::future<void>> futures;

    for (const auto& [procTarget, ocmbTargets] : targetMap)
    {
        // Everything except `this` is captured by copy, so the task does not
        // reference the caller's arguments.
        auto future = std::async(std::launch::async, [this, procTarget,
                                                      ocmbTargets, path, id,
                                                      type, cstate,
                                                      failingUnit]() {
            try
            {
                this->collectDumpFromSBE(procTarget, path, id, type, cstate,
                                         failingUnit);
            }
            catch (const std::exception& e)
            {
                lg2::error(
                    "Failed to collect dump from SBE on Proc-({PROCINDEX}) {ERROR}",
                    "PROCINDEX", pdbg_target_index(procTarget), "ERROR", e);
            }

            // Collect OCMBs only with clock on
            if (cstate == SBE_CLOCK_ON)
            {
                // Handle OCMBs serially after handling the proc
                for (auto ocmbTarget : ocmbTargets)
                {
                    try
                    {
                        this->collectDumpFromSBE(ocmbTarget, path, id, type,
                                                 cstate, failingUnit);
                    }
                    catch (const std::exception& e)
                    {
                        lg2::error(
                            "Failed to collect dump from OCMB -({OCMBINDEX}) {ERROR}",
                            "OCMBINDEX", pdbg_target_index(ocmbTarget), "ERROR",
                            e);
                    }
                }
            }
        });

        futures.push_back(std::move(future));
    }

    return futures;
}

/**
 * @brief Classify an SBE chip-op error, process its FFDC and, for a timeout,
 *        create a PEL and request an SBE dump.
 *
 * @param[in] sbeError - error object raised by the chip-op
 * @param[in] chipPos  - position of the chip the chip-op was issued to
 * @param[in] sbeType  - kind of SBE (selects chip name and event names)
 * @param[in] cmdClass - chip-op command class, OR-ed into SRC6
 * @param[in] cmdType  - chip-op command type, OR-ed into SRC6
 *
 * @return false only when the error type is SBE_INTERNAL_FFDC_DATA (FFDC is
 *         present but the chip-op itself did not fail); true in every other
 *         case, including when the SBE type is unknown.
 */
bool SbeDumpCollector::logErrorAndCreatePEL(
    const openpower::phal::sbeError_t& sbeError, uint64_t chipPos,
    SBETypes sbeType, uint32_t cmdClass, uint32_t cmdType)
{
    std::string chipName;
    std::string event;
    bool dumpIsRequired = false;
    bool isDumpFailure = true;
    try
    {
        // at() throws std::out_of_range for an unknown SBE type
        chipName = sbeTypeAttributes.at(sbeType).chipName;
        event = sbeTypeAttributes.at(sbeType).chipOpFailure;

        lg2::info("log error {CHIP} {POSITION}", "CHIP", chipName, "POSITION",
                  chipPos);

        // Common FFDC data
        openpower::dump::pel::FFDCData pelAdditionalData = {
            {"SRC6", std::format("0x{:X}{:X}", chipPos, (cmdClass | cmdType))}};

        if (sbeType == SBETypes::OCMB)
        {
            pelAdditionalData.emplace_back(
                "CHIP_TYPE", std::to_string(fapi2::TARGET_TYPE_OCMB_CHIP));
        }

        // Check the error type
        if (sbeError.errType() == openpower::phal::exception::SBE_CMD_TIMEOUT)
        {
            event = sbeTypeAttributes.at(sbeType).chipOpTimeout;
            dumpIsRequired = true;
            // For timeout, we do not expect any FFDC packets
        }
        else if (sbeError.errType() ==
                 openpower::phal::exception::SBE_FFDC_NO_DATA)
        {
            // No FFDC was returned for the failed chip-op; only the event name
            // is switched here.
            // NOTE(review): the original comment said a PEL without FFDC is
            // created for this case, but createSbeErrorPEL below is only
            // reached when dumpIsRequired is set - confirm the intent.
            lg2::error("No FFDC data after a chip-op failure {CHIP} {POSITION}",
                       "CHIP", chipName, "POSITION", chipPos);
            event = sbeTypeAttributes.at(sbeType).noFfdc;
        }
        else
        {
            if (sbeError.errType() ==
                openpower::phal::exception::SBE_INTERNAL_FFDC_DATA)
            {
                lg2::info(
                    "FFDC Not related to chip-op present {CHIP} {POSITION}",
                    "CHIP", chipName, "POSITION", chipPos);
                event = sbeTypeAttributes.at(sbeType).sbeInternalFFDCData;
                isDumpFailure = false;
            }
            else
            {
                lg2::error("Process FFDC {CHIP} {POSITION}", "CHIP", chipName,
                           "POSITION", chipPos);
            }
            // Processor FFDC Packets
            openpower::dump::pel::processFFDCPackets(sbeError, event,
                                                     pelAdditionalData);
        }

        // If dump is required, request it
        if (dumpIsRequired)
        {
            auto logId = openpower::dump::pel::createSbeErrorPEL(
                event, sbeError, pelAdditionalData);
            util::requestSBEDump(chipPos, logId, sbeType);
        }
    }
    catch (const std::out_of_range& e)
    {
        lg2::error("Unknown SBE Type({SBETYPE}) ErrorMsg({ERROR})", "SBETYPE",
                   sbeType, "ERROR", e);
    }
    catch (const std::exception& e)
    {
        lg2::error("SBE Dump request failed, chip type({CHIPTYPE}) "
                   "position({CHIPPOS}), Error: {ERROR}",
                   "CHIPTYPE", chipName, "CHIPPOS", chipPos, "ERROR", e);
    }

    return isDumpFailure;
}

/**
 * @brief Run the get-dump chip-op on one chip and write the result to a file.
 *
 * @param[in] chip        - pdbg target of the processor or OCMB
 * @param[in] path        - directory the dump file is written into
 * @param[in] id          - dump id
 * @param[in] type        - dump type
 * @param[in] clockState  - clock state requested for the chip-op
 * @param[in] failingUnit - failing unit, used to decide on fast-array
 *                          collection
 *
 * @throws std::out_of_range if the SBE type of @p chip has no entry in
 *         sbeTypeAttributes
 */
void SbeDumpCollector::collectDumpFromSBE(
    struct pdbg_target* chip, const std::filesystem::path& path, uint32_t id,
    uint8_t type, uint8_t clockState, uint64_t failingUnit)
{
    auto chipPos = pdbg_target_index(chip);
    SBETypes sbeType = getSBEType(chip);
    auto chipName = sbeTypeAttributes.at(sbeType).chipName;
    lg2::info(
        "Collecting dump from ({CHIPTYPE}) ({POSITION}): path({PATH}) id({ID}) "
        "type({TYPE}) clockState({CLOCKSTATE}) failingUnit({FAILINGUNIT})",
        "CHIPTYPE", chipName, "POSITION", chipPos, "PATH", path.string(), "ID",
        id, "TYPE", type, "CLOCKSTATE", clockState, "FAILINGUNIT", failingUnit);

    util::DumpDataPtr dataPtr;
    uint32_t len = 0;
    uint8_t collectFastArray =
        checkFastarrayCollectionNeeded(clockState, type, failingUnit, chipPos);

    try
    {
        openpower::phal::sbe::getDump(chip, type, clockState, collectFastArray,
                                      dataPtr.getPtr(), &len);
    }
    catch (const openpower::phal::sbeError_t& sbeError)
    {
        if (sbeError.errType() ==
            openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
        {
            // SBE is not ready to accept chip-ops,
            // Skip the request, no additional error handling required.
            lg2::info("Collect dump: Skipping ({ERROR}) dump({TYPE}) "
                      "on proc({PROC}) clock state({CLOCKSTATE})",
                      "ERROR", sbeError, "TYPE", type, "PROC", chipPos,
                      "CLOCKSTATE", clockState);
            return;
        }

        // If the FFDC is from actual chip-op failure this function will
        // return true, if the chip-op is not failed but FFDC is present
        // then create PELs with FFDC but write the dump contents to the
        // file.
        if (logErrorAndCreatePEL(sbeError, chipPos, sbeType,
                                 SBEFIFO_CMD_CLASS_DUMP, SBEFIFO_CMD_GET_DUMP))
        {
            lg2::error("Error in collecting dump dump type({TYPE}), "
                       "clockstate({CLOCKSTATE}), chip type({CHIPTYPE}) "
                       "position({POSITION}), "
                       "collectFastArray({COLLECTFASTARRAY}) error({ERROR})",
                       "TYPE", type, "CLOCKSTATE", clockState, "CHIPTYPE",
                       chipName, "POSITION", chipPos, "COLLECTFASTARRAY",
                       collectFastArray, "ERROR", sbeError);
            return;
        }
    }
    // The node number is always passed as 0 here
    writeDumpFile(path, id, clockState, 0, chipName, chipPos, dataPtr, len);
}

/**
 * @brief Write the collected dump data to a file below @p path.
 *
 * The file name is
 * `<id as 8 hex digits>.SbeDataClocks<On|Off>.node<nodeNum>.<chipName><chipPos>`.
 * Open and write failures are logged and reported but never thrown, so dumps
 * collected from other SBEs can still be packaged.
 *
 * @param[in] path       - directory the dump file is created in
 * @param[in] id         - dump id
 * @param[in] clockState - clock state the data was collected with
 * @param[in] nodeNum    - node number used in the file name
 * @param[in] chipName   - chip name used in the file name
 * @param[in] chipPos    - chip position used in the file name
 * @param[in] dataPtr    - holder of the dump data
 * @param[in] len        - number of bytes to write
 */
void SbeDumpCollector::writeDumpFile(
    const std::filesystem::path& path, const uint32_t id,
    const uint8_t clockState, const uint8_t nodeNum,
    const std::string& chipName, const uint8_t chipPos,
    util::DumpDataPtr& dataPtr, const uint32_t len)
{
    using namespace sdbusplus::xyz::openbmc_project::Common::Error;

    // Construct the filename
    std::ostringstream filenameBuilder;
    filenameBuilder << std::hex << std::setw(8) << std::setfill('0') << id
                    << ".SbeDataClocks"
                    << (clockState == SBE_CLOCK_ON ? "On" : "Off") << ".node"
                    << std::dec << static_cast<int>(nodeNum) << "." << chipName
                    << static_cast<int>(chipPos);

    auto dumpPath = path / filenameBuilder.str();

    // Attempt to open the file
    std::ofstream outfile(dumpPath, std::ios::out | std::ios::binary);
    if (!outfile)
    {
        using namespace sdbusplus::xyz::openbmc_project::Common::File::Error;
        using metadata = xyz::openbmc_project::Common::File::Open;
        // Unable to open the file for writing
        auto err = errno;
        lg2::error("Error opening file to write dump, "
                   "errno({ERRNO}), filepath({FILEPATH})",
                   "ERRNO", err, "FILEPATH", dumpPath.string());

        report<Open>(metadata::ERRNO(err), metadata::PATH(dumpPath.c_str()));
        // Just return here, so that the dumps collected from other
        // SBEs can be packaged.
        return;
    }

    // Write to the file
    try
    {
        // Streams do not throw unless asked to. Without this the handler
        // below is unreachable and a failed write is logged as a success.
        outfile.exceptions(std::ofstream::failbit | std::ofstream::badbit);

        outfile.write(reinterpret_cast<const char*>(dataPtr.getData()), len);
        // Flush inside the try block so a failure while draining the buffer
        // is reported too instead of being lost in the destructor.
        outfile.flush();

        lg2::info("Successfully wrote dump file "
                  "path=({PATH}) size=({SIZE})",
                  "PATH", dumpPath.string(), "SIZE", len);
    }
    catch (const std::ofstream::failure& oe)
    {
        using namespace sdbusplus::xyz::openbmc_project::Common::File::Error;
        using metadata = xyz::openbmc_project::Common::File::Write;

        // NOTE(review): oe.code() is the iostream error code, which is
        // presumably not the OS errno - confirm what ERRNO should carry.
        lg2::error(
            "Failed to write to dump file, "
            "errorMsg({ERROR}), error({ERRORCODE}), filepath({FILEPATH})",
            "ERROR", oe, "ERRORCODE", oe.code().value(), "FILEPATH",
            dumpPath.string());
        report<Write>(metadata::ERRNO(oe.code().value()),
                      metadata::PATH(dumpPath.c_str()));
        // Just return here so dumps collected from other SBEs can be
        // packaged.
    }
}

/**
 * @brief Issue the thread-stop chip-op on a processor before a hostboot dump.
 *
 * @param[in] target - pdbg target of the processor
 *
 * @return true if the processor should be included in dump collection: the
 *         chip-op succeeded, or it failed with an error other than
 *         SBE_CHIPOP_NOT_ALLOWED or SBE_CMD_TIMEOUT. false for those two
 *         error types.
 */
bool SbeDumpCollector::executeThreadStop(struct pdbg_target* target)
{
    try
    {
        openpower::phal::sbe::threadStopProc(target);
        return true;
    }
    catch (const openpower::phal::sbeError_t& sbeError)
    {
        uint64_t chipPos = pdbg_target_index(target);
        if (sbeError.errType() ==
            openpower::phal::exception::SBE_CHIPOP_NOT_ALLOWED)
        {
            lg2::info("SBE is not ready to accept chip-op: Skipping "
                      "stop instruction on proc-({POSITION}) error({ERROR}) ",
                      "POSITION", chipPos, "ERROR", sbeError);
            return false; // Do not include the target for dump collection
        }

        lg2::error("Stop instructions failed on "
                   "proc-({POSITION}) error({ERROR}) ",
                   "POSITION", chipPos, "ERROR", sbeError);

        // Return value is intentionally unused: inclusion is decided by the
        // error type below.
        logErrorAndCreatePEL(sbeError, chipPos, SBETypes::PROC,
                             SBEFIFO_CMD_CLASS_INSTRUCTION,
                             SBEFIFO_CMD_CONTROL_INSN);
        // For TIMEOUT, log the error and skip adding the processor for dump
        // collection
        if (sbeError.errType() == openpower::phal::exception::SBE_CMD_TIMEOUT)
        {
            return false;
        }
    }
    // Include the target for dump collection for SBE_CMD_FAILED or any other
    // non-critical errors
    return true;
}

} // namespace openpower::dump::sbe_chipop