1 /** 2 * Copyright © 2017 IBM Corporation 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include <map> 17 #include <memory> 18 #include <phosphor-logging/elog.hpp> 19 #include <phosphor-logging/log.hpp> 20 #include <elog-errors.hpp> 21 #include <org/open_power/Witherspoon/Fault/error.hpp> 22 #include <xyz/openbmc_project/Common/Device/error.hpp> 23 #include "names_values.hpp" 24 #include "ucd90160.hpp" 25 #include "utility.hpp" 26 27 namespace witherspoon 28 { 29 namespace power 30 { 31 32 using namespace std::string_literals; 33 34 const auto MFR_STATUS = "mfr_status"s; 35 36 const auto DEVICE_NAME = "UCD90160"s; 37 const auto DRIVER_NAME = "ucd9000"s; 38 constexpr auto NUM_PAGES = 16; 39 40 namespace fs = std::experimental::filesystem; 41 using namespace gpio; 42 using namespace pmbus; 43 using namespace phosphor::logging; 44 45 namespace device_error = sdbusplus::xyz::openbmc_project:: 46 Common::Device::Error; 47 namespace power_error = sdbusplus::org::open_power:: 48 Witherspoon::Fault::Error; 49 50 UCD90160::UCD90160(size_t instance, sdbusplus::bus::bus& bus) : 51 Device(DEVICE_NAME, instance), 52 interface(std::get<ucd90160::pathField>( 53 deviceMap.find(instance)->second), 54 DRIVER_NAME, 55 instance), 56 gpioDevice(findGPIODevice(interface.path())), 57 bus(bus) 58 { 59 } 60 61 void UCD90160::onFailure() 62 { 63 try 64 { 65 auto voutError = checkVOUTFaults(); 66 67 auto pgoodError = checkPGOODFaults(false); 68 69 //Not a voltage or PGOOD fault, but we know something 70 //failed so still create an error log. 71 if (!voutError && !pgoodError) 72 { 73 createPowerFaultLog(); 74 } 75 } 76 catch (device_error::ReadFailure& e) 77 { 78 if (!accessError) 79 { 80 commit<device_error::ReadFailure>(); 81 accessError = true; 82 } 83 } 84 } 85 86 void UCD90160::analyze() 87 { 88 try 89 { 90 //Note: Voltage faults are always fatal, so they just 91 //need to be analyzed in onFailure(). 92 93 checkPGOODFaults(true); 94 } 95 catch (device_error::ReadFailure& e) 96 { 97 if (!accessError) 98 { 99 commit<device_error::ReadFailure>(); 100 accessError = true; 101 } 102 } 103 } 104 105 uint16_t UCD90160::readStatusWord() 106 { 107 return interface.read(STATUS_WORD, Type::Debug); 108 } 109 110 uint32_t UCD90160::readMFRStatus() 111 { 112 return interface.read(MFR_STATUS, Type::DeviceDebug); 113 } 114 115 bool UCD90160::checkVOUTFaults() 116 { 117 bool errorCreated = false; 118 auto statusWord = readStatusWord(); 119 120 //The status_word register has a summary bit to tell us 121 //if each page even needs to be checked 122 if (!(statusWord & status_word::VOUT_FAULT)) 123 { 124 return errorCreated; 125 } 126 127 for (size_t page = 0; page < NUM_PAGES; page++) 128 { 129 if (isVoutFaultLogged(page)) 130 { 131 continue; 132 } 133 134 auto statusVout = interface.insertPageNum(STATUS_VOUT, page); 135 uint8_t vout = interface.read(statusVout, Type::Debug); 136 137 //If any bits are on log them, though some are just 138 //warnings so they won't cause errors 139 if (vout) 140 { 141 log<level::INFO>("A voltage rail has bits on in STATUS_VOUT", 142 entry("STATUS_VOUT=0x%X", vout), 143 entry("PAGE=%d", page)); 144 } 145 146 //Log errors if any non-warning bits on 147 if (vout & ~status_vout::WARNING_MASK) 148 { 149 auto& railNames = std::get<ucd90160::railNamesField>( 150 deviceMap.find(getInstance())->second); 151 auto railName = railNames.at(page); 152 153 util::NamesValues nv; 154 nv.add("STATUS_WORD", statusWord); 155 nv.add("STATUS_VOUT", vout); 156 nv.add("MFR_STATUS", readMFRStatus()); 157 158 using metadata = org::open_power::Witherspoon::Fault:: 159 PowerSequencerVoltageFault; 160 161 report<power_error::PowerSequencerVoltageFault>( 162 metadata::RAIL(page), 163 metadata::RAIL_NAME(railName.c_str()), 164 metadata::RAW_STATUS(nv.get().c_str())); 165 166 setVoutFaultLogged(page); 167 errorCreated = true; 168 } 169 } 170 171 return errorCreated; 172 } 173 174 bool UCD90160::checkPGOODFaults(bool polling) 175 { 176 bool errorCreated = false; 177 178 //While PGOOD faults could show up in MFR_STATUS (and we could then 179 //check the summary bit in STATUS_WORD first), they are edge triggered, 180 //and as the device driver sends a clear faults command every time we 181 //do a read, we will never see them. So, we'll have to just read the 182 //real time GPI status GPIO. 183 184 //Check only the GPIs configured on this system. 185 auto& gpiConfigs = std::get<ucd90160::gpiConfigField>( 186 deviceMap.find(getInstance())->second); 187 188 for (const auto& gpiConfig : gpiConfigs) 189 { 190 auto gpiNum = std::get<ucd90160::gpiNumField>(gpiConfig); 191 auto doPoll = std::get<ucd90160::pollField>(gpiConfig); 192 193 //Can skip this one if there is already an error on this input, 194 //or we are polling and these inputs don't need to be polled 195 //(because errors on them are fatal). 196 if (isPGOODFaultLogged(gpiNum) || (polling && !doPoll)) 197 { 198 continue; 199 } 200 201 //The real time status is read via the pin ID 202 auto pinID = std::get<ucd90160::pinIDField>(gpiConfig); 203 auto gpio = gpios.find(pinID); 204 Value gpiStatus; 205 206 try 207 { 208 //The first time through, create the GPIO objects 209 if (gpio == gpios.end()) 210 { 211 gpios.emplace( 212 pinID, 213 std::make_unique<GPIO>( 214 gpioDevice, pinID, Direction::input)); 215 gpio = gpios.find(pinID); 216 } 217 218 gpiStatus = gpio->second->read(); 219 } 220 catch (std::exception& e) 221 { 222 if (!accessError) 223 { 224 log<level::ERR>(e.what()); 225 accessError = true; 226 } 227 continue; 228 } 229 230 if (gpiStatus == Value::low) 231 { 232 //There may be some extra analysis we can do to narrow the 233 //error down further. Note that finding an error here won't 234 //prevent us from checking this GPI again. 235 errorCreated = doExtraAnalysis(gpiConfig); 236 237 if (errorCreated) 238 { 239 continue; 240 } 241 242 auto& gpiName = std::get<ucd90160::gpiNameField>(gpiConfig); 243 auto status = (gpiStatus == Value::low) ? 0 : 1; 244 245 util::NamesValues nv; 246 nv.add("STATUS_WORD", readStatusWord()); 247 nv.add("MFR_STATUS", readMFRStatus()); 248 nv.add("INPUT_STATUS", status); 249 250 using metadata = org::open_power::Witherspoon::Fault:: 251 PowerSequencerPGOODFault; 252 253 report<power_error::PowerSequencerPGOODFault>( 254 metadata::INPUT_NUM(gpiNum), 255 metadata::INPUT_NAME(gpiName.c_str()), 256 metadata::RAW_STATUS(nv.get().c_str())); 257 258 setPGOODFaultLogged(gpiNum); 259 errorCreated = true; 260 } 261 } 262 263 return errorCreated; 264 } 265 266 void UCD90160::createPowerFaultLog() 267 { 268 util::NamesValues nv; 269 nv.add("STATUS_WORD", readStatusWord()); 270 nv.add("MFR_STATUS", readMFRStatus()); 271 272 using metadata = org::open_power::Witherspoon::Fault:: 273 PowerSequencerFault; 274 275 report<power_error::PowerSequencerFault>( 276 metadata::RAW_STATUS(nv.get().c_str())); 277 } 278 279 fs::path UCD90160::findGPIODevice(const fs::path& path) 280 { 281 fs::path gpioDevicePath; 282 283 //In the driver directory, look for a subdirectory 284 //named gpiochipX, where X is some number. Then 285 //we'll access the GPIO at /dev/gpiochipX. 286 if (fs::is_directory(path)) 287 { 288 for (auto& f : fs::directory_iterator(path)) 289 { 290 if (f.path().filename().string().find("gpiochip") != 291 std::string::npos) 292 { 293 gpioDevicePath = "/dev" / f.path().filename(); 294 break; 295 } 296 } 297 } 298 299 if (gpioDevicePath.empty()) 300 { 301 log<level::ERR>("Could not find GPIO device path", 302 entry("BASE_PATH=%s", path.c_str())); 303 } 304 305 return gpioDevicePath; 306 } 307 308 bool UCD90160::doExtraAnalysis(const ucd90160::GPIConfig& config) 309 { 310 311 auto type = std::get<ucd90160::extraAnalysisField>(config); 312 if (type == ucd90160::extraAnalysisType::none) 313 { 314 return false; 315 } 316 317 //Currently the only extra analysis to do is to check other GPIOs. 318 return doGPIOAnalysis(type); 319 } 320 321 bool UCD90160::doGPIOAnalysis(ucd90160::extraAnalysisType type) 322 { 323 bool errorFound = false; 324 bool shutdown = false; 325 326 const auto& analysisConfig = std::get<ucd90160::gpioAnalysisField>( 327 deviceMap.find(getInstance())->second); 328 329 auto gpioConfig = analysisConfig.find(type); 330 if (gpioConfig == analysisConfig.end()) 331 { 332 return errorFound; 333 } 334 335 auto path = std::get<ucd90160::gpioDevicePathField>( 336 gpioConfig->second); 337 338 //The /dev/gpiochipX device 339 auto device = findGPIODevice(path); 340 341 //The GPIO value of the fault condition 342 auto polarity = std::get<ucd90160::gpioPolarityField>( 343 gpioConfig->second); 344 345 //The GPIOs to check 346 auto& gpios = std::get<ucd90160::gpioDefinitionField>( 347 gpioConfig->second); 348 349 for (const auto& gpio : gpios) 350 { 351 gpio::Value value; 352 353 try 354 { 355 GPIO g{device, 356 std::get<ucd90160::gpioNumField>(gpio), 357 Direction::input}; 358 359 value = g.read(); 360 } 361 catch (std::exception& e) 362 { 363 if (!gpioAccessError) 364 { 365 //GPIO only throws InternalErrors - not worth committing. 366 log<level::ERR>( 367 "GPIO read failed while analyzing a power fault", 368 entry("CHIP_PATH=%s", path.c_str())); 369 370 gpioAccessError = true; 371 } 372 continue; 373 } 374 375 if (value == polarity) 376 { 377 errorFound = true; 378 379 auto part = std::get<ucd90160::gpioCalloutField>(gpio); 380 PartCallout callout{type, part}; 381 382 if (isPartCalledOut(callout)) 383 { 384 continue; 385 } 386 387 //Look up and call the error creation function 388 auto logError = std::get<ucd90160::errorFunctionField>( 389 gpioConfig->second); 390 391 logError(*this, part); 392 393 //Save the part callout so we don't call it out again 394 setPartCallout(callout); 395 396 //Some errors (like overtemps) require a shutdown 397 auto actions = static_cast<uint32_t>( 398 std::get<ucd90160::optionFlagsField>(gpioConfig->second)); 399 400 if (actions & static_cast<decltype(actions)>( 401 ucd90160::optionFlags::shutdownOnFault)) 402 { 403 shutdown = true; 404 } 405 } 406 } 407 408 if (shutdown) 409 { 410 //Will be replaced with a GPU specific error in a future commit 411 util::powerOff<power_error::Shutdown>(bus); 412 } 413 414 return errorFound; 415 } 416 417 void UCD90160::gpuPGOODError(const std::string& callout) 418 { 419 util::NamesValues nv; 420 nv.add("STATUS_WORD", readStatusWord()); 421 nv.add("MFR_STATUS", readMFRStatus()); 422 423 using metadata = org::open_power::Witherspoon::Fault::GPUPowerFault; 424 425 report<power_error::GPUPowerFault>( 426 metadata::RAW_STATUS(nv.get().c_str()), 427 metadata::GPU(callout.c_str())); 428 } 429 430 void UCD90160::gpuOverTempError(const std::string& callout) 431 { 432 util::NamesValues nv; 433 nv.add("STATUS_WORD", readStatusWord()); 434 nv.add("MFR_STATUS", readMFRStatus()); 435 436 using metadata = org::open_power::Witherspoon::Fault::GPUOverTemp; 437 438 report<power_error::GPUOverTemp>( 439 metadata::RAW_STATUS(nv.get().c_str()), 440 metadata::GPU(callout.c_str())); 441 } 442 443 } 444 } 445