1 /** 2 * Copyright © 2017 IBM Corporation 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include "ucd90160.hpp" 17 18 #include "names_values.hpp" 19 #include "utility.hpp" 20 21 #include <elog-errors.hpp> 22 #include <org/open_power/Witherspoon/Fault/error.hpp> 23 #include <phosphor-logging/elog.hpp> 24 #include <phosphor-logging/log.hpp> 25 #include <xyz/openbmc_project/Common/Device/error.hpp> 26 27 #include <map> 28 #include <memory> 29 30 namespace phosphor 31 { 32 namespace power 33 { 34 35 using namespace std::string_literals; 36 37 const auto MFR_STATUS = "mfr_status"s; 38 39 const auto DEVICE_NAME = "UCD90160"s; 40 const auto DRIVER_NAME = "ucd9000"s; 41 constexpr auto NUM_PAGES = 16; 42 43 constexpr auto INVENTORY_OBJ_PATH = "/xyz/openbmc_project/inventory"; 44 45 namespace fs = std::filesystem; 46 using namespace gpio; 47 using namespace pmbus; 48 using namespace phosphor::logging; 49 50 namespace device_error = sdbusplus::xyz::openbmc_project::Common::Device::Error; 51 namespace power_error = sdbusplus::org::open_power::Witherspoon::Fault::Error; 52 53 UCD90160::UCD90160(size_t instance, sdbusplus::bus::bus& bus) : 54 Device(DEVICE_NAME, instance), 55 interface(std::get<ucd90160::pathField>(deviceMap.find(instance)->second), 56 DRIVER_NAME, instance), 57 gpioDevice(findGPIODevice(interface.path())), bus(bus) 58 { 59 } 60 61 void UCD90160::onFailure() 62 { 63 try 64 { 65 auto voutError = checkVOUTFaults(); 66 67 auto pgoodError = checkPGOODFaults(false); 68 69 // Not a voltage or PGOOD fault, but we know something 70 // failed so still create an error log. 71 if (!voutError && !pgoodError) 72 { 73 createPowerFaultLog(); 74 } 75 } 76 catch (device_error::ReadFailure& e) 77 { 78 if (!accessError) 79 { 80 commit<device_error::ReadFailure>(); 81 accessError = true; 82 } 83 } 84 } 85 86 void UCD90160::analyze() 87 { 88 try 89 { 90 // Note: Voltage faults are always fatal, so they just 91 // need to be analyzed in onFailure(). 92 93 checkPGOODFaults(true); 94 } 95 catch (device_error::ReadFailure& e) 96 { 97 if (!accessError) 98 { 99 commit<device_error::ReadFailure>(); 100 accessError = true; 101 } 102 } 103 } 104 105 uint16_t UCD90160::readStatusWord() 106 { 107 return interface.read(STATUS_WORD, Type::Debug); 108 } 109 110 uint32_t UCD90160::readMFRStatus() 111 { 112 return interface.read(MFR_STATUS, Type::HwmonDeviceDebug); 113 } 114 115 bool UCD90160::checkVOUTFaults() 116 { 117 bool errorCreated = false; 118 auto statusWord = readStatusWord(); 119 120 // The status_word register has a summary bit to tell us 121 // if each page even needs to be checked 122 if (!(statusWord & status_word::VOUT_FAULT)) 123 { 124 return errorCreated; 125 } 126 127 for (size_t page = 0; page < NUM_PAGES; page++) 128 { 129 if (isVoutFaultLogged(page)) 130 { 131 continue; 132 } 133 134 auto statusVout = interface.insertPageNum(STATUS_VOUT, page); 135 uint8_t vout = interface.read(statusVout, Type::Debug); 136 137 // If any bits are on log them, though some are just 138 // warnings so they won't cause errors 139 if (vout) 140 { 141 log<level::INFO>("A voltage rail has bits on in STATUS_VOUT", 142 entry("STATUS_VOUT=0x%X", vout), 143 entry("PAGE=%d", page)); 144 } 145 146 // Log errors if any non-warning bits on 147 if (vout & ~status_vout::WARNING_MASK) 148 { 149 auto& railNames = std::get<ucd90160::railNamesField>( 150 deviceMap.find(getInstance())->second); 151 auto railName = railNames.at(page); 152 153 util::NamesValues nv; 154 try 155 { 156 nv.add("STATUS_WORD", statusWord); 157 nv.add("STATUS_VOUT", vout); 158 nv.add("MFR_STATUS", readMFRStatus()); 159 } 160 catch (device_error::ReadFailure& e) 161 { 162 log<level::ERR>("ReadFailure when collecting metadata"); 163 commit<device_error::ReadFailure>(); 164 } 165 166 using metadata = 167 org::open_power::Witherspoon::Fault::PowerSequencerVoltageFault; 168 169 report<power_error::PowerSequencerVoltageFault>( 170 metadata::RAIL(page), metadata::RAIL_NAME(railName.c_str()), 171 metadata::RAW_STATUS(nv.get().c_str())); 172 173 setVoutFaultLogged(page); 174 errorCreated = true; 175 } 176 } 177 178 return errorCreated; 179 } 180 181 bool UCD90160::checkPGOODFaults(bool polling) 182 { 183 bool errorCreated = false; 184 185 // While PGOOD faults could show up in MFR_STATUS (and we could then 186 // check the summary bit in STATUS_WORD first), they are edge triggered, 187 // and as the device driver sends a clear faults command every time we 188 // do a read, we will never see them. So, we'll have to just read the 189 // real time GPI status GPIO. 190 191 // Check only the GPIs configured on this system. 192 auto& gpiConfigs = std::get<ucd90160::gpiConfigField>( 193 deviceMap.find(getInstance())->second); 194 195 for (const auto& gpiConfig : gpiConfigs) 196 { 197 auto gpiNum = std::get<ucd90160::gpiNumField>(gpiConfig); 198 auto doPoll = std::get<ucd90160::pollField>(gpiConfig); 199 200 // Can skip this one if there is already an error on this input, 201 // or we are polling and these inputs don't need to be polled 202 //(because errors on them are fatal). 203 if (isPGOODFaultLogged(gpiNum) || (polling && !doPoll)) 204 { 205 continue; 206 } 207 208 // The real time status is read via the pin ID 209 auto pinID = std::get<ucd90160::pinIDField>(gpiConfig); 210 auto gpio = gpios.find(pinID); 211 Value gpiStatus; 212 213 try 214 { 215 // The first time through, create the GPIO objects 216 if (gpio == gpios.end()) 217 { 218 gpios.emplace(pinID, std::make_unique<GPIO>(gpioDevice, pinID, 219 Direction::input)); 220 gpio = gpios.find(pinID); 221 } 222 223 gpiStatus = gpio->second->read(); 224 } 225 catch (std::exception& e) 226 { 227 if (!accessError) 228 { 229 log<level::ERR>(e.what()); 230 accessError = true; 231 } 232 continue; 233 } 234 235 if (gpiStatus == Value::low) 236 { 237 // There may be some extra analysis we can do to narrow the 238 // error down further. Note that finding an error here won't 239 // prevent us from checking this GPI again. 240 errorCreated = doExtraAnalysis(gpiConfig); 241 242 if (errorCreated) 243 { 244 continue; 245 } 246 247 auto& gpiName = std::get<ucd90160::gpiNameField>(gpiConfig); 248 auto status = (gpiStatus == Value::low) ? 0 : 1; 249 250 util::NamesValues nv; 251 252 try 253 { 254 nv.add("STATUS_WORD", readStatusWord()); 255 nv.add("MFR_STATUS", readMFRStatus()); 256 nv.add("INPUT_STATUS", status); 257 } 258 catch (device_error::ReadFailure& e) 259 { 260 log<level::ERR>("ReadFailure when collecting metadata"); 261 commit<device_error::ReadFailure>(); 262 } 263 264 using metadata = 265 org::open_power::Witherspoon::Fault::PowerSequencerPGOODFault; 266 267 report<power_error::PowerSequencerPGOODFault>( 268 metadata::INPUT_NUM(gpiNum), 269 metadata::INPUT_NAME(gpiName.c_str()), 270 metadata::RAW_STATUS(nv.get().c_str())); 271 272 setPGOODFaultLogged(gpiNum); 273 errorCreated = true; 274 } 275 } 276 277 return errorCreated; 278 } 279 280 void UCD90160::createPowerFaultLog() 281 { 282 util::NamesValues nv; 283 284 try 285 { 286 nv.add("STATUS_WORD", readStatusWord()); 287 nv.add("MFR_STATUS", readMFRStatus()); 288 } 289 catch (device_error::ReadFailure& e) 290 { 291 log<level::ERR>("ReadFailure when collecting metadata"); 292 commit<device_error::ReadFailure>(); 293 } 294 295 using metadata = org::open_power::Witherspoon::Fault::PowerSequencerFault; 296 297 report<power_error::PowerSequencerFault>( 298 metadata::RAW_STATUS(nv.get().c_str())); 299 } 300 301 fs::path UCD90160::findGPIODevice(const fs::path& path) 302 { 303 fs::path gpioDevicePath; 304 305 // In the driver directory, look for a subdirectory 306 // named gpiochipX, where X is some number. Then 307 // we'll access the GPIO at /dev/gpiochipX. 308 if (fs::is_directory(path)) 309 { 310 for (auto& f : fs::directory_iterator(path)) 311 { 312 if (f.path().filename().string().find("gpiochip") != 313 std::string::npos) 314 { 315 gpioDevicePath = "/dev" / f.path().filename(); 316 break; 317 } 318 } 319 } 320 321 if (gpioDevicePath.empty()) 322 { 323 log<level::ERR>("Could not find GPIO device path", 324 entry("BASE_PATH=%s", path.c_str())); 325 } 326 327 return gpioDevicePath; 328 } 329 330 bool UCD90160::doExtraAnalysis(const ucd90160::GPIConfig& config) 331 { 332 333 auto type = std::get<ucd90160::extraAnalysisField>(config); 334 if (type == ucd90160::extraAnalysisType::none) 335 { 336 return false; 337 } 338 339 // Currently the only extra analysis to do is to check other GPIOs. 340 return doGPIOAnalysis(type); 341 } 342 343 bool UCD90160::doGPIOAnalysis(ucd90160::extraAnalysisType type) 344 { 345 bool errorFound = false; 346 bool shutdown = false; 347 348 const auto& analysisConfig = std::get<ucd90160::gpioAnalysisField>( 349 deviceMap.find(getInstance())->second); 350 351 auto gpioConfig = analysisConfig.find(type); 352 if (gpioConfig == analysisConfig.end()) 353 { 354 return errorFound; 355 } 356 357 auto path = std::get<ucd90160::gpioDevicePathField>(gpioConfig->second); 358 359 // The /dev/gpiochipX device 360 auto device = findGPIODevice(path); 361 362 if (device.empty()) 363 { 364 log<level::ERR>( 365 "Missing GPIO device - cannot do GPIO analysis of fault", 366 entry("ANALYSIS_TYPE=%d\n", type)); 367 return errorFound; 368 } 369 370 // The GPIO value of the fault condition 371 auto polarity = std::get<ucd90160::gpioPolarityField>(gpioConfig->second); 372 373 // The GPIOs to check 374 auto& gpios = std::get<ucd90160::gpioDefinitionField>(gpioConfig->second); 375 376 for (const auto& gpio : gpios) 377 { 378 gpio::Value value; 379 380 try 381 { 382 GPIO g{device, std::get<ucd90160::gpioNumField>(gpio), 383 Direction::input}; 384 385 value = g.read(); 386 } 387 catch (std::exception& e) 388 { 389 if (!gpioAccessError) 390 { 391 // GPIO only throws InternalErrors - not worth committing. 392 log<level::ERR>( 393 "GPIO read failed while analyzing a power fault", 394 entry("CHIP_PATH=%s", path.c_str())); 395 396 gpioAccessError = true; 397 } 398 continue; 399 } 400 401 if (value == polarity) 402 { 403 errorFound = true; 404 405 std::string part{INVENTORY_OBJ_PATH}; 406 part = part + std::get<ucd90160::gpioCalloutField>(gpio); 407 PartCallout callout{type, part}; 408 409 if (isPartCalledOut(callout)) 410 { 411 continue; 412 } 413 414 // Look up and call the error creation function 415 auto logError = 416 std::get<ucd90160::errorFunctionField>(gpioConfig->second); 417 418 logError(*this, part); 419 420 // Save the part callout so we don't call it out again 421 setPartCallout(callout); 422 423 // Some errors (like overtemps) require a shutdown 424 auto actions = static_cast<uint32_t>( 425 std::get<ucd90160::optionFlagsField>(gpioConfig->second)); 426 427 if (actions & static_cast<decltype(actions)>( 428 ucd90160::optionFlags::shutdownOnFault)) 429 { 430 shutdown = true; 431 } 432 } 433 } 434 435 if (shutdown) 436 { 437 // Will be replaced with a GPU specific error in a future commit 438 util::powerOff<power_error::Shutdown>(bus); 439 } 440 441 return errorFound; 442 } 443 444 void UCD90160::gpuPGOODError(const std::string& callout) 445 { 446 util::NamesValues nv; 447 448 try 449 { 450 nv.add("STATUS_WORD", readStatusWord()); 451 nv.add("MFR_STATUS", readMFRStatus()); 452 } 453 catch (device_error::ReadFailure& e) 454 { 455 log<level::ERR>("ReadFailure when collecting metadata"); 456 commit<device_error::ReadFailure>(); 457 } 458 459 using metadata = org::open_power::Witherspoon::Fault::GPUPowerFault; 460 461 report<power_error::GPUPowerFault>( 462 metadata::RAW_STATUS(nv.get().c_str()), 463 metadata::CALLOUT_INVENTORY_PATH(callout.c_str())); 464 } 465 466 void UCD90160::gpuOverTempError(const std::string& callout) 467 { 468 util::NamesValues nv; 469 470 try 471 { 472 nv.add("STATUS_WORD", readStatusWord()); 473 nv.add("MFR_STATUS", readMFRStatus()); 474 } 475 catch (device_error::ReadFailure& e) 476 { 477 log<level::ERR>("ReadFailure when collecting metadata"); 478 commit<device_error::ReadFailure>(); 479 } 480 481 using metadata = org::open_power::Witherspoon::Fault::GPUOverTemp; 482 483 report<power_error::GPUOverTemp>( 484 metadata::RAW_STATUS(nv.get().c_str()), 485 metadata::CALLOUT_INVENTORY_PATH(callout.c_str())); 486 } 487 488 void UCD90160::memGoodError(const std::string& callout) 489 { 490 util::NamesValues nv; 491 492 try 493 { 494 nv.add("STATUS_WORD", readStatusWord()); 495 nv.add("MFR_STATUS", readMFRStatus()); 496 } 497 catch (device_error::ReadFailure& e) 498 { 499 log<level::ERR>("ReadFailure when collecting metadata"); 500 commit<device_error::ReadFailure>(); 501 } 502 503 using metadata = org::open_power::Witherspoon::Fault::MemoryPowerFault; 504 505 report<power_error::MemoryPowerFault>( 506 metadata::RAW_STATUS(nv.get().c_str()), 507 metadata::CALLOUT_INVENTORY_PATH(callout.c_str())); 508 } 509 510 } // namespace power 511 } // namespace phosphor 512