1 /** 2 * Copyright © 2017 IBM Corporation 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include "ucd90160.hpp" 17 18 #include "names_values.hpp" 19 #include "utility.hpp" 20 21 #include <elog-errors.hpp> 22 #include <org/open_power/Witherspoon/Fault/error.hpp> 23 #include <phosphor-logging/elog.hpp> 24 #include <phosphor-logging/log.hpp> 25 #include <xyz/openbmc_project/Common/Device/error.hpp> 26 27 #include <map> 28 #include <memory> 29 30 namespace phosphor 31 { 32 namespace power 33 { 34 35 using namespace std::string_literals; 36 37 const auto MFR_STATUS = "mfr_status"s; 38 39 const auto DEVICE_NAME = "UCD90160"s; 40 const auto DRIVER_NAME = "ucd9000"s; 41 constexpr auto NUM_PAGES = 16; 42 43 constexpr auto INVENTORY_OBJ_PATH = "/xyz/openbmc_project/inventory"; 44 45 namespace fs = std::filesystem; 46 using namespace gpio; 47 using namespace pmbus; 48 using namespace phosphor::logging; 49 50 namespace device_error = sdbusplus::xyz::openbmc_project::Common::Device::Error; 51 namespace power_error = sdbusplus::org::open_power::Witherspoon::Fault::Error; 52 53 UCD90160::UCD90160(size_t instance, sdbusplus::bus_t& bus) : 54 Device(DEVICE_NAME, instance), 55 interface(std::get<ucd90160::pathField>(deviceMap.find(instance)->second), 56 DRIVER_NAME, instance), 57 gpioDevice(findGPIODevice(interface.path())), bus(bus) 58 {} 59 60 void UCD90160::onFailure() 61 { 62 try 63 { 64 auto voutError = checkVOUTFaults(); 65 66 auto pgoodError = checkPGOODFaults(false); 67 68 // Not a voltage or PGOOD fault, but we know something 69 // failed so still create an error log. 70 if (!voutError && !pgoodError) 71 { 72 createPowerFaultLog(); 73 } 74 } 75 catch (const device_error::ReadFailure& e) 76 { 77 if (!accessError) 78 { 79 commit<device_error::ReadFailure>(); 80 accessError = true; 81 } 82 } 83 } 84 85 void UCD90160::analyze() 86 { 87 try 88 { 89 // Note: Voltage faults are always fatal, so they just 90 // need to be analyzed in onFailure(). 91 92 checkPGOODFaults(true); 93 } 94 catch (const device_error::ReadFailure& e) 95 { 96 if (!accessError) 97 { 98 commit<device_error::ReadFailure>(); 99 accessError = true; 100 } 101 } 102 } 103 104 uint16_t UCD90160::readStatusWord() 105 { 106 return interface.read(STATUS_WORD, Type::Debug); 107 } 108 109 uint32_t UCD90160::readMFRStatus() 110 { 111 return interface.read(MFR_STATUS, Type::HwmonDeviceDebug); 112 } 113 114 bool UCD90160::checkVOUTFaults() 115 { 116 bool errorCreated = false; 117 auto statusWord = readStatusWord(); 118 119 // The status_word register has a summary bit to tell us 120 // if each page even needs to be checked 121 if (!(statusWord & status_word::VOUT_FAULT)) 122 { 123 return errorCreated; 124 } 125 126 for (size_t page = 0; page < NUM_PAGES; page++) 127 { 128 if (isVoutFaultLogged(page)) 129 { 130 continue; 131 } 132 133 auto statusVout = interface.insertPageNum(STATUS_VOUT, page); 134 uint8_t vout = interface.read(statusVout, Type::Debug); 135 136 // If any bits are on log them, though some are just 137 // warnings so they won't cause errors 138 if (vout) 139 { 140 log<level::INFO>("A voltage rail has bits on in STATUS_VOUT", 141 entry("STATUS_VOUT=0x%X", vout), 142 entry("PAGE=%d", page)); 143 } 144 145 // Log errors if any non-warning bits on 146 if (vout & ~status_vout::WARNING_MASK) 147 { 148 auto& railNames = std::get<ucd90160::railNamesField>( 149 deviceMap.find(getInstance())->second); 150 auto railName = railNames.at(page); 151 152 util::NamesValues nv; 153 try 154 { 155 nv.add("STATUS_WORD", statusWord); 156 nv.add("STATUS_VOUT", vout); 157 nv.add("MFR_STATUS", readMFRStatus()); 158 } 159 catch (const device_error::ReadFailure& e) 160 { 161 log<level::ERR>("ReadFailure when collecting metadata"); 162 commit<device_error::ReadFailure>(); 163 } 164 165 using metadata = 166 org::open_power::Witherspoon::Fault::PowerSequencerVoltageFault; 167 168 report<power_error::PowerSequencerVoltageFault>( 169 metadata::RAIL(page), metadata::RAIL_NAME(railName.c_str()), 170 metadata::RAW_STATUS(nv.get().c_str())); 171 172 setVoutFaultLogged(page); 173 errorCreated = true; 174 } 175 } 176 177 return errorCreated; 178 } 179 180 bool UCD90160::checkPGOODFaults(bool polling) 181 { 182 bool errorCreated = false; 183 184 // While PGOOD faults could show up in MFR_STATUS (and we could then 185 // check the summary bit in STATUS_WORD first), they are edge triggered, 186 // and as the device driver sends a clear faults command every time we 187 // do a read, we will never see them. So, we'll have to just read the 188 // real time GPI status GPIO. 189 190 // Check only the GPIs configured on this system. 191 auto& gpiConfigs = std::get<ucd90160::gpiConfigField>( 192 deviceMap.find(getInstance())->second); 193 194 for (const auto& gpiConfig : gpiConfigs) 195 { 196 auto gpiNum = std::get<ucd90160::gpiNumField>(gpiConfig); 197 auto doPoll = std::get<ucd90160::pollField>(gpiConfig); 198 199 // Can skip this one if there is already an error on this input, 200 // or we are polling and these inputs don't need to be polled 201 //(because errors on them are fatal). 202 if (isPGOODFaultLogged(gpiNum) || (polling && !doPoll)) 203 { 204 continue; 205 } 206 207 // The real time status is read via the pin ID 208 auto pinID = std::get<ucd90160::pinIDField>(gpiConfig); 209 auto gpio = gpios.find(pinID); 210 Value gpiStatus; 211 212 try 213 { 214 // The first time through, create the GPIO objects 215 if (gpio == gpios.end()) 216 { 217 gpios.emplace(pinID, std::make_unique<GPIO>(gpioDevice, pinID, 218 Direction::input)); 219 gpio = gpios.find(pinID); 220 } 221 222 gpiStatus = gpio->second->read(); 223 } 224 catch (const std::exception& e) 225 { 226 if (!accessError) 227 { 228 log<level::ERR>(e.what()); 229 accessError = true; 230 } 231 continue; 232 } 233 234 if (gpiStatus == Value::low) 235 { 236 // There may be some extra analysis we can do to narrow the 237 // error down further. Note that finding an error here won't 238 // prevent us from checking this GPI again. 239 errorCreated = doExtraAnalysis(gpiConfig); 240 241 if (errorCreated) 242 { 243 continue; 244 } 245 246 auto& gpiName = std::get<ucd90160::gpiNameField>(gpiConfig); 247 auto status = (gpiStatus == Value::low) ? 0 : 1; 248 249 util::NamesValues nv; 250 251 try 252 { 253 nv.add("STATUS_WORD", readStatusWord()); 254 nv.add("MFR_STATUS", readMFRStatus()); 255 nv.add("INPUT_STATUS", status); 256 } 257 catch (const device_error::ReadFailure& e) 258 { 259 log<level::ERR>("ReadFailure when collecting metadata"); 260 commit<device_error::ReadFailure>(); 261 } 262 263 using metadata = 264 org::open_power::Witherspoon::Fault::PowerSequencerPGOODFault; 265 266 report<power_error::PowerSequencerPGOODFault>( 267 metadata::INPUT_NUM(gpiNum), 268 metadata::INPUT_NAME(gpiName.c_str()), 269 metadata::RAW_STATUS(nv.get().c_str())); 270 271 setPGOODFaultLogged(gpiNum); 272 errorCreated = true; 273 } 274 } 275 276 return errorCreated; 277 } 278 279 void UCD90160::createPowerFaultLog() 280 { 281 util::NamesValues nv; 282 283 try 284 { 285 nv.add("STATUS_WORD", readStatusWord()); 286 nv.add("MFR_STATUS", readMFRStatus()); 287 } 288 catch (const device_error::ReadFailure& e) 289 { 290 log<level::ERR>("ReadFailure when collecting metadata"); 291 commit<device_error::ReadFailure>(); 292 } 293 294 using metadata = org::open_power::Witherspoon::Fault::PowerSequencerFault; 295 296 report<power_error::PowerSequencerFault>( 297 metadata::RAW_STATUS(nv.get().c_str())); 298 } 299 300 fs::path UCD90160::findGPIODevice(const fs::path& path) 301 { 302 fs::path gpioDevicePath; 303 304 // In the driver directory, look for a subdirectory 305 // named gpiochipX, where X is some number. Then 306 // we'll access the GPIO at /dev/gpiochipX. 307 if (fs::is_directory(path)) 308 { 309 for (auto& f : fs::directory_iterator(path)) 310 { 311 if (f.path().filename().string().find("gpiochip") != 312 std::string::npos) 313 { 314 gpioDevicePath = "/dev" / f.path().filename(); 315 break; 316 } 317 } 318 } 319 320 if (gpioDevicePath.empty()) 321 { 322 log<level::ERR>("Could not find GPIO device path", 323 entry("BASE_PATH=%s", path.c_str())); 324 } 325 326 return gpioDevicePath; 327 } 328 329 bool UCD90160::doExtraAnalysis(const ucd90160::GPIConfig& config) 330 { 331 332 auto type = std::get<ucd90160::extraAnalysisField>(config); 333 if (type == ucd90160::extraAnalysisType::none) 334 { 335 return false; 336 } 337 338 // Currently the only extra analysis to do is to check other GPIOs. 339 return doGPIOAnalysis(type); 340 } 341 342 bool UCD90160::doGPIOAnalysis(ucd90160::extraAnalysisType type) 343 { 344 bool errorFound = false; 345 bool shutdown = false; 346 347 const auto& analysisConfig = std::get<ucd90160::gpioAnalysisField>( 348 deviceMap.find(getInstance())->second); 349 350 auto gpioConfig = analysisConfig.find(type); 351 if (gpioConfig == analysisConfig.end()) 352 { 353 return errorFound; 354 } 355 356 auto path = std::get<ucd90160::gpioDevicePathField>(gpioConfig->second); 357 358 // The /dev/gpiochipX device 359 auto device = findGPIODevice(path); 360 361 if (device.empty()) 362 { 363 log<level::ERR>( 364 "Missing GPIO device - cannot do GPIO analysis of fault", 365 entry("ANALYSIS_TYPE=%d\n", type)); 366 return errorFound; 367 } 368 369 // The GPIO value of the fault condition 370 auto polarity = std::get<ucd90160::gpioPolarityField>(gpioConfig->second); 371 372 // The GPIOs to check 373 auto& gpios = std::get<ucd90160::gpioDefinitionField>(gpioConfig->second); 374 375 for (const auto& gpio : gpios) 376 { 377 gpio::Value value; 378 379 try 380 { 381 GPIO g{device, std::get<ucd90160::gpioNumField>(gpio), 382 Direction::input}; 383 384 value = g.read(); 385 } 386 catch (const std::exception& e) 387 { 388 if (!gpioAccessError) 389 { 390 // GPIO only throws InternalErrors - not worth committing. 391 log<level::ERR>( 392 "GPIO read failed while analyzing a power fault", 393 entry("CHIP_PATH=%s", path.c_str())); 394 395 gpioAccessError = true; 396 } 397 continue; 398 } 399 400 if (value == polarity) 401 { 402 errorFound = true; 403 404 std::string part{INVENTORY_OBJ_PATH}; 405 part = part + std::get<ucd90160::gpioCalloutField>(gpio); 406 PartCallout callout{type, part}; 407 408 if (isPartCalledOut(callout)) 409 { 410 continue; 411 } 412 413 // Look up and call the error creation function 414 auto logError = 415 std::get<ucd90160::errorFunctionField>(gpioConfig->second); 416 417 logError(*this, part); 418 419 // Save the part callout so we don't call it out again 420 setPartCallout(callout); 421 422 // Some errors (like overtemps) require a shutdown 423 auto actions = static_cast<uint32_t>( 424 std::get<ucd90160::optionFlagsField>(gpioConfig->second)); 425 426 if (actions & static_cast<decltype(actions)>( 427 ucd90160::optionFlags::shutdownOnFault)) 428 { 429 shutdown = true; 430 } 431 } 432 } 433 434 if (shutdown) 435 { 436 // Will be replaced with a GPU specific error in a future commit 437 util::powerOff<power_error::Shutdown>(bus); 438 } 439 440 return errorFound; 441 } 442 443 void UCD90160::gpuPGOODError(const std::string& callout) 444 { 445 util::NamesValues nv; 446 447 try 448 { 449 nv.add("STATUS_WORD", readStatusWord()); 450 nv.add("MFR_STATUS", readMFRStatus()); 451 } 452 catch (const device_error::ReadFailure& e) 453 { 454 log<level::ERR>("ReadFailure when collecting metadata"); 455 commit<device_error::ReadFailure>(); 456 } 457 458 using metadata = org::open_power::Witherspoon::Fault::GPUPowerFault; 459 460 report<power_error::GPUPowerFault>( 461 metadata::RAW_STATUS(nv.get().c_str()), 462 metadata::CALLOUT_INVENTORY_PATH(callout.c_str())); 463 } 464 465 void UCD90160::gpuOverTempError(const std::string& callout) 466 { 467 util::NamesValues nv; 468 469 try 470 { 471 nv.add("STATUS_WORD", readStatusWord()); 472 nv.add("MFR_STATUS", readMFRStatus()); 473 } 474 catch (const device_error::ReadFailure& e) 475 { 476 log<level::ERR>("ReadFailure when collecting metadata"); 477 commit<device_error::ReadFailure>(); 478 } 479 480 using metadata = org::open_power::Witherspoon::Fault::GPUOverTemp; 481 482 report<power_error::GPUOverTemp>( 483 metadata::RAW_STATUS(nv.get().c_str()), 484 metadata::CALLOUT_INVENTORY_PATH(callout.c_str())); 485 } 486 487 void UCD90160::memGoodError(const std::string& callout) 488 { 489 util::NamesValues nv; 490 491 try 492 { 493 nv.add("STATUS_WORD", readStatusWord()); 494 nv.add("MFR_STATUS", readMFRStatus()); 495 } 496 catch (const device_error::ReadFailure& e) 497 { 498 log<level::ERR>("ReadFailure when collecting metadata"); 499 commit<device_error::ReadFailure>(); 500 } 501 502 using metadata = org::open_power::Witherspoon::Fault::MemoryPowerFault; 503 504 report<power_error::MemoryPowerFault>( 505 metadata::RAW_STATUS(nv.get().c_str()), 506 metadata::CALLOUT_INVENTORY_PATH(callout.c_str())); 507 } 508 509 } // namespace power 510 } // namespace phosphor 511