1 /** 2 * Copyright © 2017 IBM Corporation 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include "ucd90160.hpp" 17 18 #include "names_values.hpp" 19 #include "utility.hpp" 20 21 #include <elog-errors.hpp> 22 #include <map> 23 #include <memory> 24 #include <org/open_power/Witherspoon/Fault/error.hpp> 25 #include <phosphor-logging/elog.hpp> 26 #include <phosphor-logging/log.hpp> 27 #include <xyz/openbmc_project/Common/Device/error.hpp> 28 29 namespace witherspoon 30 { 31 namespace power 32 { 33 34 using namespace std::string_literals; 35 36 const auto MFR_STATUS = "mfr_status"s; 37 38 const auto DEVICE_NAME = "UCD90160"s; 39 const auto DRIVER_NAME = "ucd9000"s; 40 constexpr auto NUM_PAGES = 16; 41 42 constexpr auto INVENTORY_OBJ_PATH = "/xyz/openbmc_project/inventory"; 43 44 namespace fs = std::experimental::filesystem; 45 using namespace gpio; 46 using namespace pmbus; 47 using namespace phosphor::logging; 48 49 namespace device_error = sdbusplus::xyz::openbmc_project::Common::Device::Error; 50 namespace power_error = sdbusplus::org::open_power::Witherspoon::Fault::Error; 51 52 UCD90160::UCD90160(size_t instance, sdbusplus::bus::bus& bus) : 53 Device(DEVICE_NAME, instance), 54 interface(std::get<ucd90160::pathField>(deviceMap.find(instance)->second), 55 DRIVER_NAME, instance), 56 gpioDevice(findGPIODevice(interface.path())), bus(bus) 57 { 58 } 59 60 void UCD90160::onFailure() 61 { 62 try 63 { 64 auto voutError = checkVOUTFaults(); 65 66 auto pgoodError = checkPGOODFaults(false); 67 68 // Not a voltage or PGOOD fault, but we know something 69 // failed so still create an error log. 70 if (!voutError && !pgoodError) 71 { 72 createPowerFaultLog(); 73 } 74 } 75 catch (device_error::ReadFailure& e) 76 { 77 if (!accessError) 78 { 79 commit<device_error::ReadFailure>(); 80 accessError = true; 81 } 82 } 83 } 84 85 void UCD90160::analyze() 86 { 87 try 88 { 89 // Note: Voltage faults are always fatal, so they just 90 // need to be analyzed in onFailure(). 91 92 checkPGOODFaults(true); 93 } 94 catch (device_error::ReadFailure& e) 95 { 96 if (!accessError) 97 { 98 commit<device_error::ReadFailure>(); 99 accessError = true; 100 } 101 } 102 } 103 104 uint16_t UCD90160::readStatusWord() 105 { 106 return interface.read(STATUS_WORD, Type::Debug); 107 } 108 109 uint32_t UCD90160::readMFRStatus() 110 { 111 return interface.read(MFR_STATUS, Type::HwmonDeviceDebug); 112 } 113 114 bool UCD90160::checkVOUTFaults() 115 { 116 bool errorCreated = false; 117 auto statusWord = readStatusWord(); 118 119 // The status_word register has a summary bit to tell us 120 // if each page even needs to be checked 121 if (!(statusWord & status_word::VOUT_FAULT)) 122 { 123 return errorCreated; 124 } 125 126 for (size_t page = 0; page < NUM_PAGES; page++) 127 { 128 if (isVoutFaultLogged(page)) 129 { 130 continue; 131 } 132 133 auto statusVout = interface.insertPageNum(STATUS_VOUT, page); 134 uint8_t vout = interface.read(statusVout, Type::Debug); 135 136 // If any bits are on log them, though some are just 137 // warnings so they won't cause errors 138 if (vout) 139 { 140 log<level::INFO>("A voltage rail has bits on in STATUS_VOUT", 141 entry("STATUS_VOUT=0x%X", vout), 142 entry("PAGE=%d", page)); 143 } 144 145 // Log errors if any non-warning bits on 146 if (vout & ~status_vout::WARNING_MASK) 147 { 148 auto& railNames = std::get<ucd90160::railNamesField>( 149 deviceMap.find(getInstance())->second); 150 auto railName = railNames.at(page); 151 152 util::NamesValues nv; 153 try 154 { 155 nv.add("STATUS_WORD", statusWord); 156 nv.add("STATUS_VOUT", vout); 157 nv.add("MFR_STATUS", readMFRStatus()); 158 } 159 catch (device_error::ReadFailure& e) 160 { 161 log<level::ERR>("ReadFailure when collecting metadata"); 162 commit<device_error::ReadFailure>(); 163 } 164 165 using metadata = 166 org::open_power::Witherspoon::Fault::PowerSequencerVoltageFault; 167 168 report<power_error::PowerSequencerVoltageFault>( 169 metadata::RAIL(page), metadata::RAIL_NAME(railName.c_str()), 170 metadata::RAW_STATUS(nv.get().c_str())); 171 172 setVoutFaultLogged(page); 173 errorCreated = true; 174 } 175 } 176 177 return errorCreated; 178 } 179 180 bool UCD90160::checkPGOODFaults(bool polling) 181 { 182 bool errorCreated = false; 183 184 // While PGOOD faults could show up in MFR_STATUS (and we could then 185 // check the summary bit in STATUS_WORD first), they are edge triggered, 186 // and as the device driver sends a clear faults command every time we 187 // do a read, we will never see them. So, we'll have to just read the 188 // real time GPI status GPIO. 189 190 // Check only the GPIs configured on this system. 191 auto& gpiConfigs = std::get<ucd90160::gpiConfigField>( 192 deviceMap.find(getInstance())->second); 193 194 for (const auto& gpiConfig : gpiConfigs) 195 { 196 auto gpiNum = std::get<ucd90160::gpiNumField>(gpiConfig); 197 auto doPoll = std::get<ucd90160::pollField>(gpiConfig); 198 199 // Can skip this one if there is already an error on this input, 200 // or we are polling and these inputs don't need to be polled 201 //(because errors on them are fatal). 202 if (isPGOODFaultLogged(gpiNum) || (polling && !doPoll)) 203 { 204 continue; 205 } 206 207 // The real time status is read via the pin ID 208 auto pinID = std::get<ucd90160::pinIDField>(gpiConfig); 209 auto gpio = gpios.find(pinID); 210 Value gpiStatus; 211 212 try 213 { 214 // The first time through, create the GPIO objects 215 if (gpio == gpios.end()) 216 { 217 gpios.emplace(pinID, std::make_unique<GPIO>(gpioDevice, pinID, 218 Direction::input)); 219 gpio = gpios.find(pinID); 220 } 221 222 gpiStatus = gpio->second->read(); 223 } 224 catch (std::exception& e) 225 { 226 if (!accessError) 227 { 228 log<level::ERR>(e.what()); 229 accessError = true; 230 } 231 continue; 232 } 233 234 if (gpiStatus == Value::low) 235 { 236 // There may be some extra analysis we can do to narrow the 237 // error down further. Note that finding an error here won't 238 // prevent us from checking this GPI again. 239 errorCreated = doExtraAnalysis(gpiConfig); 240 241 if (errorCreated) 242 { 243 continue; 244 } 245 246 auto& gpiName = std::get<ucd90160::gpiNameField>(gpiConfig); 247 auto status = (gpiStatus == Value::low) ? 0 : 1; 248 249 util::NamesValues nv; 250 251 try 252 { 253 nv.add("STATUS_WORD", readStatusWord()); 254 nv.add("MFR_STATUS", readMFRStatus()); 255 nv.add("INPUT_STATUS", status); 256 } 257 catch (device_error::ReadFailure& e) 258 { 259 log<level::ERR>("ReadFailure when collecting metadata"); 260 commit<device_error::ReadFailure>(); 261 } 262 263 using metadata = 264 org::open_power::Witherspoon::Fault::PowerSequencerPGOODFault; 265 266 report<power_error::PowerSequencerPGOODFault>( 267 metadata::INPUT_NUM(gpiNum), 268 metadata::INPUT_NAME(gpiName.c_str()), 269 metadata::RAW_STATUS(nv.get().c_str())); 270 271 setPGOODFaultLogged(gpiNum); 272 errorCreated = true; 273 } 274 } 275 276 return errorCreated; 277 } 278 279 void UCD90160::createPowerFaultLog() 280 { 281 util::NamesValues nv; 282 283 try 284 { 285 nv.add("STATUS_WORD", readStatusWord()); 286 nv.add("MFR_STATUS", readMFRStatus()); 287 } 288 catch (device_error::ReadFailure& e) 289 { 290 log<level::ERR>("ReadFailure when collecting metadata"); 291 commit<device_error::ReadFailure>(); 292 } 293 294 using metadata = org::open_power::Witherspoon::Fault::PowerSequencerFault; 295 296 report<power_error::PowerSequencerFault>( 297 metadata::RAW_STATUS(nv.get().c_str())); 298 } 299 300 fs::path UCD90160::findGPIODevice(const fs::path& path) 301 { 302 fs::path gpioDevicePath; 303 304 // In the driver directory, look for a subdirectory 305 // named gpiochipX, where X is some number. Then 306 // we'll access the GPIO at /dev/gpiochipX. 307 if (fs::is_directory(path)) 308 { 309 for (auto& f : fs::directory_iterator(path)) 310 { 311 if (f.path().filename().string().find("gpiochip") != 312 std::string::npos) 313 { 314 gpioDevicePath = "/dev" / f.path().filename(); 315 break; 316 } 317 } 318 } 319 320 if (gpioDevicePath.empty()) 321 { 322 log<level::ERR>("Could not find GPIO device path", 323 entry("BASE_PATH=%s", path.c_str())); 324 } 325 326 return gpioDevicePath; 327 } 328 329 bool UCD90160::doExtraAnalysis(const ucd90160::GPIConfig& config) 330 { 331 332 auto type = std::get<ucd90160::extraAnalysisField>(config); 333 if (type == ucd90160::extraAnalysisType::none) 334 { 335 return false; 336 } 337 338 // Currently the only extra analysis to do is to check other GPIOs. 339 return doGPIOAnalysis(type); 340 } 341 342 bool UCD90160::doGPIOAnalysis(ucd90160::extraAnalysisType type) 343 { 344 bool errorFound = false; 345 bool shutdown = false; 346 347 const auto& analysisConfig = std::get<ucd90160::gpioAnalysisField>( 348 deviceMap.find(getInstance())->second); 349 350 auto gpioConfig = analysisConfig.find(type); 351 if (gpioConfig == analysisConfig.end()) 352 { 353 return errorFound; 354 } 355 356 auto path = std::get<ucd90160::gpioDevicePathField>(gpioConfig->second); 357 358 // The /dev/gpiochipX device 359 auto device = findGPIODevice(path); 360 361 // The GPIO value of the fault condition 362 auto polarity = std::get<ucd90160::gpioPolarityField>(gpioConfig->second); 363 364 // The GPIOs to check 365 auto& gpios = std::get<ucd90160::gpioDefinitionField>(gpioConfig->second); 366 367 for (const auto& gpio : gpios) 368 { 369 gpio::Value value; 370 371 try 372 { 373 GPIO g{device, std::get<ucd90160::gpioNumField>(gpio), 374 Direction::input}; 375 376 value = g.read(); 377 } 378 catch (std::exception& e) 379 { 380 if (!gpioAccessError) 381 { 382 // GPIO only throws InternalErrors - not worth committing. 383 log<level::ERR>( 384 "GPIO read failed while analyzing a power fault", 385 entry("CHIP_PATH=%s", path.c_str())); 386 387 gpioAccessError = true; 388 } 389 continue; 390 } 391 392 if (value == polarity) 393 { 394 errorFound = true; 395 396 std::string part{INVENTORY_OBJ_PATH}; 397 part = part + std::get<ucd90160::gpioCalloutField>(gpio); 398 PartCallout callout{type, part}; 399 400 if (isPartCalledOut(callout)) 401 { 402 continue; 403 } 404 405 // Look up and call the error creation function 406 auto logError = 407 std::get<ucd90160::errorFunctionField>(gpioConfig->second); 408 409 logError(*this, part); 410 411 // Save the part callout so we don't call it out again 412 setPartCallout(callout); 413 414 // Some errors (like overtemps) require a shutdown 415 auto actions = static_cast<uint32_t>( 416 std::get<ucd90160::optionFlagsField>(gpioConfig->second)); 417 418 if (actions & static_cast<decltype(actions)>( 419 ucd90160::optionFlags::shutdownOnFault)) 420 { 421 shutdown = true; 422 } 423 } 424 } 425 426 if (shutdown) 427 { 428 // Will be replaced with a GPU specific error in a future commit 429 util::powerOff<power_error::Shutdown>(bus); 430 } 431 432 return errorFound; 433 } 434 435 void UCD90160::gpuPGOODError(const std::string& callout) 436 { 437 util::NamesValues nv; 438 439 try 440 { 441 nv.add("STATUS_WORD", readStatusWord()); 442 nv.add("MFR_STATUS", readMFRStatus()); 443 } 444 catch (device_error::ReadFailure& e) 445 { 446 log<level::ERR>("ReadFailure when collecting metadata"); 447 commit<device_error::ReadFailure>(); 448 } 449 450 using metadata = org::open_power::Witherspoon::Fault::GPUPowerFault; 451 452 report<power_error::GPUPowerFault>( 453 metadata::RAW_STATUS(nv.get().c_str()), 454 metadata::CALLOUT_INVENTORY_PATH(callout.c_str())); 455 } 456 457 void UCD90160::gpuOverTempError(const std::string& callout) 458 { 459 util::NamesValues nv; 460 461 try 462 { 463 nv.add("STATUS_WORD", readStatusWord()); 464 nv.add("MFR_STATUS", readMFRStatus()); 465 } 466 catch (device_error::ReadFailure& e) 467 { 468 log<level::ERR>("ReadFailure when collecting metadata"); 469 commit<device_error::ReadFailure>(); 470 } 471 472 using metadata = org::open_power::Witherspoon::Fault::GPUOverTemp; 473 474 report<power_error::GPUOverTemp>( 475 metadata::RAW_STATUS(nv.get().c_str()), 476 metadata::CALLOUT_INVENTORY_PATH(callout.c_str())); 477 } 478 479 } // namespace power 480 } // namespace witherspoon 481