1 /** 2 * Copyright © 2017 IBM Corporation 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include "ucd90160.hpp" 17 18 #include "names_values.hpp" 19 #include "utility.hpp" 20 21 #include <elog-errors.hpp> 22 #include <org/open_power/Witherspoon/Fault/error.hpp> 23 #include <phosphor-logging/elog.hpp> 24 #include <phosphor-logging/log.hpp> 25 #include <xyz/openbmc_project/Common/Device/error.hpp> 26 27 #include <map> 28 #include <memory> 29 30 namespace phosphor 31 { 32 namespace power 33 { 34 35 using namespace std::string_literals; 36 37 const auto MFR_STATUS = "mfr_status"s; 38 39 const auto DEVICE_NAME = "UCD90160"s; 40 const auto DRIVER_NAME = "ucd9000"s; 41 constexpr auto NUM_PAGES = 16; 42 43 constexpr auto INVENTORY_OBJ_PATH = "/xyz/openbmc_project/inventory"; 44 45 namespace fs = std::filesystem; 46 using namespace gpio; 47 using namespace pmbus; 48 using namespace phosphor::logging; 49 50 namespace device_error = sdbusplus::xyz::openbmc_project::Common::Device::Error; 51 namespace power_error = sdbusplus::org::open_power::Witherspoon::Fault::Error; 52 53 UCD90160::UCD90160(size_t instance, sdbusplus::bus_t& bus) : 54 Device(DEVICE_NAME, instance), 55 interface(std::get<ucd90160::pathField>(deviceMap.find(instance)->second), 56 DRIVER_NAME, instance), 57 gpioDevice(findGPIODevice(interface.path())), bus(bus) 58 {} 59 60 void UCD90160::onFailure() 61 { 62 try 63 { 64 auto voutError = checkVOUTFaults(); 65 66 auto pgoodError = checkPGOODFaults(false); 67 68 // Not a voltage or PGOOD fault, but we know something 69 // failed so still create an error log. 70 if (!voutError && !pgoodError) 71 { 72 createPowerFaultLog(); 73 } 74 } 75 catch (const device_error::ReadFailure& e) 76 { 77 if (!accessError) 78 { 79 commit<device_error::ReadFailure>(); 80 accessError = true; 81 } 82 } 83 } 84 85 void UCD90160::analyze() 86 { 87 try 88 { 89 // Note: Voltage faults are always fatal, so they just 90 // need to be analyzed in onFailure(). 91 92 checkPGOODFaults(true); 93 } 94 catch (const device_error::ReadFailure& e) 95 { 96 if (!accessError) 97 { 98 commit<device_error::ReadFailure>(); 99 accessError = true; 100 } 101 } 102 } 103 104 uint16_t UCD90160::readStatusWord() 105 { 106 return interface.read(STATUS_WORD, Type::Debug); 107 } 108 109 uint32_t UCD90160::readMFRStatus() 110 { 111 return interface.read(MFR_STATUS, Type::HwmonDeviceDebug); 112 } 113 114 bool UCD90160::checkVOUTFaults() 115 { 116 bool errorCreated = false; 117 auto statusWord = readStatusWord(); 118 119 // The status_word register has a summary bit to tell us 120 // if each page even needs to be checked 121 if (!(statusWord & status_word::VOUT_FAULT)) 122 { 123 return errorCreated; 124 } 125 126 for (size_t page = 0; page < NUM_PAGES; page++) 127 { 128 if (isVoutFaultLogged(page)) 129 { 130 continue; 131 } 132 133 auto statusVout = interface.insertPageNum(STATUS_VOUT, page); 134 uint8_t vout = interface.read(statusVout, Type::Debug); 135 136 // If any bits are on log them, though some are just 137 // warnings so they won't cause errors 138 if (vout) 139 { 140 log<level::INFO>("A voltage rail has bits on in STATUS_VOUT", 141 entry("STATUS_VOUT=0x%X", vout), 142 entry("PAGE=%d", page)); 143 } 144 145 // Log errors if any non-warning bits on 146 if (vout & ~status_vout::WARNING_MASK) 147 { 148 auto& railNames = std::get<ucd90160::railNamesField>( 149 deviceMap.find(getInstance())->second); 150 auto railName = railNames.at(page); 151 152 util::NamesValues nv; 153 try 154 { 155 nv.add("STATUS_WORD", statusWord); 156 nv.add("STATUS_VOUT", vout); 157 nv.add("MFR_STATUS", readMFRStatus()); 158 } 159 catch (const device_error::ReadFailure& e) 160 { 161 log<level::ERR>("ReadFailure when collecting metadata"); 162 commit<device_error::ReadFailure>(); 163 } 164 165 using metadata = 166 org::open_power::Witherspoon::Fault::PowerSequencerVoltageFault; 167 168 report<power_error::PowerSequencerVoltageFault>( 169 metadata::RAIL(page), metadata::RAIL_NAME(railName.c_str()), 170 metadata::RAW_STATUS(nv.get().c_str())); 171 172 setVoutFaultLogged(page); 173 errorCreated = true; 174 } 175 } 176 177 return errorCreated; 178 } 179 180 bool UCD90160::checkPGOODFaults(bool polling) 181 { 182 bool errorCreated = false; 183 184 // While PGOOD faults could show up in MFR_STATUS (and we could then 185 // check the summary bit in STATUS_WORD first), they are edge triggered, 186 // and as the device driver sends a clear faults command every time we 187 // do a read, we will never see them. So, we'll have to just read the 188 // real time GPI status GPIO. 189 190 // Check only the GPIs configured on this system. 191 auto& gpiConfigs = std::get<ucd90160::gpiConfigField>( 192 deviceMap.find(getInstance())->second); 193 194 for (const auto& gpiConfig : gpiConfigs) 195 { 196 auto gpiNum = std::get<ucd90160::gpiNumField>(gpiConfig); 197 auto doPoll = std::get<ucd90160::pollField>(gpiConfig); 198 199 // Can skip this one if there is already an error on this input, 200 // or we are polling and these inputs don't need to be polled 201 //(because errors on them are fatal). 202 if (isPGOODFaultLogged(gpiNum) || (polling && !doPoll)) 203 { 204 continue; 205 } 206 207 // The real time status is read via the pin ID 208 auto pinID = std::get<ucd90160::pinIDField>(gpiConfig); 209 auto gpio = gpios.find(pinID); 210 Value gpiStatus; 211 212 try 213 { 214 // The first time through, create the GPIO objects 215 if (gpio == gpios.end()) 216 { 217 gpios.emplace(pinID, std::make_unique<GPIO>(gpioDevice, pinID, 218 Direction::input)); 219 gpio = gpios.find(pinID); 220 } 221 222 gpiStatus = gpio->second->read(); 223 } 224 catch (const std::exception& e) 225 { 226 if (!accessError) 227 { 228 log<level::ERR>(e.what()); 229 accessError = true; 230 } 231 continue; 232 } 233 234 if (gpiStatus == Value::low) 235 { 236 // There may be some extra analysis we can do to narrow the 237 // error down further. Note that finding an error here won't 238 // prevent us from checking this GPI again. 239 errorCreated = doExtraAnalysis(gpiConfig); 240 241 if (errorCreated) 242 { 243 continue; 244 } 245 246 auto& gpiName = std::get<ucd90160::gpiNameField>(gpiConfig); 247 auto status = (gpiStatus == Value::low) ? 0 : 1; 248 249 util::NamesValues nv; 250 251 try 252 { 253 nv.add("STATUS_WORD", readStatusWord()); 254 nv.add("MFR_STATUS", readMFRStatus()); 255 nv.add("INPUT_STATUS", status); 256 } 257 catch (const device_error::ReadFailure& e) 258 { 259 log<level::ERR>("ReadFailure when collecting metadata"); 260 commit<device_error::ReadFailure>(); 261 } 262 263 using metadata = 264 org::open_power::Witherspoon::Fault::PowerSequencerPGOODFault; 265 266 report<power_error::PowerSequencerPGOODFault>( 267 metadata::INPUT_NUM(gpiNum), 268 metadata::INPUT_NAME(gpiName.c_str()), 269 metadata::RAW_STATUS(nv.get().c_str())); 270 271 setPGOODFaultLogged(gpiNum); 272 errorCreated = true; 273 } 274 } 275 276 return errorCreated; 277 } 278 279 void UCD90160::createPowerFaultLog() 280 { 281 util::NamesValues nv; 282 283 try 284 { 285 nv.add("STATUS_WORD", readStatusWord()); 286 nv.add("MFR_STATUS", readMFRStatus()); 287 } 288 catch (const device_error::ReadFailure& e) 289 { 290 log<level::ERR>("ReadFailure when collecting metadata"); 291 commit<device_error::ReadFailure>(); 292 } 293 294 using metadata = org::open_power::Witherspoon::Fault::PowerSequencerFault; 295 296 report<power_error::PowerSequencerFault>( 297 metadata::RAW_STATUS(nv.get().c_str())); 298 } 299 300 fs::path UCD90160::findGPIODevice(const fs::path& path) 301 { 302 fs::path gpioDevicePath; 303 304 // In the driver directory, look for a subdirectory 305 // named gpiochipX, where X is some number. Then 306 // we'll access the GPIO at /dev/gpiochipX. 307 if (fs::is_directory(path)) 308 { 309 for (auto& f : fs::directory_iterator(path)) 310 { 311 if (f.path().filename().string().find("gpiochip") != 312 std::string::npos) 313 { 314 gpioDevicePath = "/dev" / f.path().filename(); 315 break; 316 } 317 } 318 } 319 320 if (gpioDevicePath.empty()) 321 { 322 log<level::ERR>("Could not find GPIO device path", 323 entry("BASE_PATH=%s", path.c_str())); 324 } 325 326 return gpioDevicePath; 327 } 328 329 bool UCD90160::doExtraAnalysis(const ucd90160::GPIConfig& config) 330 { 331 auto type = std::get<ucd90160::extraAnalysisField>(config); 332 if (type == ucd90160::extraAnalysisType::none) 333 { 334 return false; 335 } 336 337 // Currently the only extra analysis to do is to check other GPIOs. 338 return doGPIOAnalysis(type); 339 } 340 341 bool UCD90160::doGPIOAnalysis(ucd90160::extraAnalysisType type) 342 { 343 bool errorFound = false; 344 bool shutdown = false; 345 346 const auto& analysisConfig = std::get<ucd90160::gpioAnalysisField>( 347 deviceMap.find(getInstance())->second); 348 349 auto gpioConfig = analysisConfig.find(type); 350 if (gpioConfig == analysisConfig.end()) 351 { 352 return errorFound; 353 } 354 355 auto path = std::get<ucd90160::gpioDevicePathField>(gpioConfig->second); 356 357 // The /dev/gpiochipX device 358 auto device = findGPIODevice(path); 359 360 if (device.empty()) 361 { 362 log<level::ERR>( 363 "Missing GPIO device - cannot do GPIO analysis of fault", 364 entry("ANALYSIS_TYPE=%d\n", type)); 365 return errorFound; 366 } 367 368 // The GPIO value of the fault condition 369 auto polarity = std::get<ucd90160::gpioPolarityField>(gpioConfig->second); 370 371 // The GPIOs to check 372 auto& gpios = std::get<ucd90160::gpioDefinitionField>(gpioConfig->second); 373 374 for (const auto& gpio : gpios) 375 { 376 gpio::Value value; 377 378 try 379 { 380 GPIO g{device, std::get<ucd90160::gpioNumField>(gpio), 381 Direction::input}; 382 383 value = g.read(); 384 } 385 catch (const std::exception& e) 386 { 387 if (!gpioAccessError) 388 { 389 // GPIO only throws InternalErrors - not worth committing. 390 log<level::ERR>( 391 "GPIO read failed while analyzing a power fault", 392 entry("CHIP_PATH=%s", path.c_str())); 393 394 gpioAccessError = true; 395 } 396 continue; 397 } 398 399 if (value == polarity) 400 { 401 errorFound = true; 402 403 std::string part{INVENTORY_OBJ_PATH}; 404 part = part + std::get<ucd90160::gpioCalloutField>(gpio); 405 PartCallout callout{type, part}; 406 407 if (isPartCalledOut(callout)) 408 { 409 continue; 410 } 411 412 // Look up and call the error creation function 413 auto logError = 414 std::get<ucd90160::errorFunctionField>(gpioConfig->second); 415 416 logError(*this, part); 417 418 // Save the part callout so we don't call it out again 419 setPartCallout(callout); 420 421 // Some errors (like overtemps) require a shutdown 422 auto actions = static_cast<uint32_t>( 423 std::get<ucd90160::optionFlagsField>(gpioConfig->second)); 424 425 if (actions & static_cast<decltype(actions)>( 426 ucd90160::optionFlags::shutdownOnFault)) 427 { 428 shutdown = true; 429 } 430 } 431 } 432 433 if (shutdown) 434 { 435 // Will be replaced with a GPU specific error in a future commit 436 util::powerOff<power_error::Shutdown>(bus); 437 } 438 439 return errorFound; 440 } 441 442 void UCD90160::gpuPGOODError(const std::string& callout) 443 { 444 util::NamesValues nv; 445 446 try 447 { 448 nv.add("STATUS_WORD", readStatusWord()); 449 nv.add("MFR_STATUS", readMFRStatus()); 450 } 451 catch (const device_error::ReadFailure& e) 452 { 453 log<level::ERR>("ReadFailure when collecting metadata"); 454 commit<device_error::ReadFailure>(); 455 } 456 457 using metadata = org::open_power::Witherspoon::Fault::GPUPowerFault; 458 459 report<power_error::GPUPowerFault>( 460 metadata::RAW_STATUS(nv.get().c_str()), 461 metadata::CALLOUT_INVENTORY_PATH(callout.c_str())); 462 } 463 464 void UCD90160::gpuOverTempError(const std::string& callout) 465 { 466 util::NamesValues nv; 467 468 try 469 { 470 nv.add("STATUS_WORD", readStatusWord()); 471 nv.add("MFR_STATUS", readMFRStatus()); 472 } 473 catch (const device_error::ReadFailure& e) 474 { 475 log<level::ERR>("ReadFailure when collecting metadata"); 476 commit<device_error::ReadFailure>(); 477 } 478 479 using metadata = org::open_power::Witherspoon::Fault::GPUOverTemp; 480 481 report<power_error::GPUOverTemp>( 482 metadata::RAW_STATUS(nv.get().c_str()), 483 metadata::CALLOUT_INVENTORY_PATH(callout.c_str())); 484 } 485 486 void UCD90160::memGoodError(const std::string& callout) 487 { 488 util::NamesValues nv; 489 490 try 491 { 492 nv.add("STATUS_WORD", readStatusWord()); 493 nv.add("MFR_STATUS", readMFRStatus()); 494 } 495 catch (const device_error::ReadFailure& e) 496 { 497 log<level::ERR>("ReadFailure when collecting metadata"); 498 commit<device_error::ReadFailure>(); 499 } 500 501 using metadata = org::open_power::Witherspoon::Fault::MemoryPowerFault; 502 503 report<power_error::MemoryPowerFault>( 504 metadata::RAW_STATUS(nv.get().c_str()), 505 metadata::CALLOUT_INVENTORY_PATH(callout.c_str())); 506 } 507 508 } // namespace power 509 } // namespace phosphor 510