1 /**
2 * Copyright © 2017 IBM Corporation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "ucd90160.hpp"
17
18 #include "names_values.hpp"
19 #include "utility.hpp"
20
21 #include <elog-errors.hpp>
22 #include <org/open_power/Witherspoon/Fault/error.hpp>
23 #include <phosphor-logging/elog.hpp>
24 #include <phosphor-logging/log.hpp>
25 #include <xyz/openbmc_project/Common/Device/error.hpp>
26
27 #include <map>
28 #include <memory>
29
30 namespace phosphor
31 {
32 namespace power
33 {
34
// Enables the ""s std::string literal suffix used by the constants below.
using namespace std::string_literals;

// Name handed to interface.read() by readMFRStatus().
const auto MFR_STATUS = "mfr_status"s;

// Device name passed to the Device base class by the constructor.
const auto DEVICE_NAME = "UCD90160"s;
// Driver name passed to the PMBus interface object by the constructor.
const auto DRIVER_NAME = "ucd9000"s;
// Number of pages whose STATUS_VOUT is examined by checkVOUTFaults().
constexpr auto NUM_PAGES = 16;

// Prefix prepended to the configured callout in doGPIOAnalysis() to form
// the part path handed to the error creation function.
constexpr auto INVENTORY_OBJ_PATH = "/xyz/openbmc_project/inventory";

namespace fs = std::filesystem;
using namespace gpio;
using namespace pmbus;
using namespace phosphor::logging;

// Short aliases for the error namespaces used with commit<>() and report<>().
namespace device_error = sdbusplus::xyz::openbmc_project::Common::Device::Error;
namespace power_error = sdbusplus::org::open_power::Witherspoon::Fault::Error;
52
/**
 * @brief Constructor
 *
 * Initializes the Device base with DEVICE_NAME and the instance number,
 * builds the interface object from the path stored in deviceMap for this
 * instance together with DRIVER_NAME, and resolves the GPIO device by
 * calling findGPIODevice() on the interface's path.
 *
 * @param[in] instance - the device instance number, used as the deviceMap key
 * @param[in] bus - the D-Bus bus object, saved for later use
 */
UCD90160::UCD90160(size_t instance, sdbusplus::bus_t& bus) :
    Device(DEVICE_NAME, instance),
    // NOTE(review): the result of deviceMap.find() is dereferenced without
    // an end() check - presumably every valid instance has an entry in
    // deviceMap; confirm against the callers.
    interface(std::get<ucd90160::pathField>(deviceMap.find(instance)->second),
              DRIVER_NAME, instance),
    gpioDevice(findGPIODevice(interface.path())), bus(bus)
{}
59
onFailure()60 void UCD90160::onFailure()
61 {
62 try
63 {
64 auto voutError = checkVOUTFaults();
65
66 auto pgoodError = checkPGOODFaults(false);
67
68 // Not a voltage or PGOOD fault, but we know something
69 // failed so still create an error log.
70 if (!voutError && !pgoodError)
71 {
72 createPowerFaultLog();
73 }
74 }
75 catch (const device_error::ReadFailure& e)
76 {
77 if (!accessError)
78 {
79 commit<device_error::ReadFailure>();
80 accessError = true;
81 }
82 }
83 }
84
analyze()85 void UCD90160::analyze()
86 {
87 try
88 {
89 // Note: Voltage faults are always fatal, so they just
90 // need to be analyzed in onFailure().
91
92 checkPGOODFaults(true);
93 }
94 catch (const device_error::ReadFailure& e)
95 {
96 if (!accessError)
97 {
98 commit<device_error::ReadFailure>();
99 accessError = true;
100 }
101 }
102 }
103
readStatusWord()104 uint16_t UCD90160::readStatusWord()
105 {
106 return interface.read(STATUS_WORD, Type::Debug);
107 }
108
readMFRStatus()109 uint32_t UCD90160::readMFRStatus()
110 {
111 return interface.read(MFR_STATUS, Type::HwmonDeviceDebug);
112 }
113
checkVOUTFaults()114 bool UCD90160::checkVOUTFaults()
115 {
116 bool errorCreated = false;
117 auto statusWord = readStatusWord();
118
119 // The status_word register has a summary bit to tell us
120 // if each page even needs to be checked
121 if (!(statusWord & status_word::VOUT_FAULT))
122 {
123 return errorCreated;
124 }
125
126 for (size_t page = 0; page < NUM_PAGES; page++)
127 {
128 if (isVoutFaultLogged(page))
129 {
130 continue;
131 }
132
133 auto statusVout = interface.insertPageNum(STATUS_VOUT, page);
134 uint8_t vout = interface.read(statusVout, Type::Debug);
135
136 // If any bits are on log them, though some are just
137 // warnings so they won't cause errors
138 if (vout)
139 {
140 log<level::INFO>("A voltage rail has bits on in STATUS_VOUT",
141 entry("STATUS_VOUT=0x%X", vout),
142 entry("PAGE=%d", page));
143 }
144
145 // Log errors if any non-warning bits on
146 if (vout & ~status_vout::WARNING_MASK)
147 {
148 auto& railNames = std::get<ucd90160::railNamesField>(
149 deviceMap.find(getInstance())->second);
150 auto railName = railNames.at(page);
151
152 util::NamesValues nv;
153 try
154 {
155 nv.add("STATUS_WORD", statusWord);
156 nv.add("STATUS_VOUT", vout);
157 nv.add("MFR_STATUS", readMFRStatus());
158 }
159 catch (const device_error::ReadFailure& e)
160 {
161 log<level::ERR>("ReadFailure when collecting metadata");
162 commit<device_error::ReadFailure>();
163 }
164
165 using metadata =
166 org::open_power::Witherspoon::Fault::PowerSequencerVoltageFault;
167
168 report<power_error::PowerSequencerVoltageFault>(
169 metadata::RAIL(page), metadata::RAIL_NAME(railName.c_str()),
170 metadata::RAW_STATUS(nv.get().c_str()));
171
172 setVoutFaultLogged(page);
173 errorCreated = true;
174 }
175 }
176
177 return errorCreated;
178 }
179
checkPGOODFaults(bool polling)180 bool UCD90160::checkPGOODFaults(bool polling)
181 {
182 bool errorCreated = false;
183
184 // While PGOOD faults could show up in MFR_STATUS (and we could then
185 // check the summary bit in STATUS_WORD first), they are edge triggered,
186 // and as the device driver sends a clear faults command every time we
187 // do a read, we will never see them. So, we'll have to just read the
188 // real time GPI status GPIO.
189
190 // Check only the GPIs configured on this system.
191 auto& gpiConfigs = std::get<ucd90160::gpiConfigField>(
192 deviceMap.find(getInstance())->second);
193
194 for (const auto& gpiConfig : gpiConfigs)
195 {
196 auto gpiNum = std::get<ucd90160::gpiNumField>(gpiConfig);
197 auto doPoll = std::get<ucd90160::pollField>(gpiConfig);
198
199 // Can skip this one if there is already an error on this input,
200 // or we are polling and these inputs don't need to be polled
201 //(because errors on them are fatal).
202 if (isPGOODFaultLogged(gpiNum) || (polling && !doPoll))
203 {
204 continue;
205 }
206
207 // The real time status is read via the pin ID
208 auto pinID = std::get<ucd90160::pinIDField>(gpiConfig);
209 auto gpio = gpios.find(pinID);
210 Value gpiStatus;
211
212 try
213 {
214 // The first time through, create the GPIO objects
215 if (gpio == gpios.end())
216 {
217 gpios.emplace(pinID, std::make_unique<GPIO>(gpioDevice, pinID,
218 Direction::input));
219 gpio = gpios.find(pinID);
220 }
221
222 gpiStatus = gpio->second->read();
223 }
224 catch (const std::exception& e)
225 {
226 if (!accessError)
227 {
228 log<level::ERR>(e.what());
229 accessError = true;
230 }
231 continue;
232 }
233
234 if (gpiStatus == Value::low)
235 {
236 // There may be some extra analysis we can do to narrow the
237 // error down further. Note that finding an error here won't
238 // prevent us from checking this GPI again.
239 errorCreated = doExtraAnalysis(gpiConfig);
240
241 if (errorCreated)
242 {
243 continue;
244 }
245
246 auto& gpiName = std::get<ucd90160::gpiNameField>(gpiConfig);
247 auto status = (gpiStatus == Value::low) ? 0 : 1;
248
249 util::NamesValues nv;
250
251 try
252 {
253 nv.add("STATUS_WORD", readStatusWord());
254 nv.add("MFR_STATUS", readMFRStatus());
255 nv.add("INPUT_STATUS", status);
256 }
257 catch (const device_error::ReadFailure& e)
258 {
259 log<level::ERR>("ReadFailure when collecting metadata");
260 commit<device_error::ReadFailure>();
261 }
262
263 using metadata =
264 org::open_power::Witherspoon::Fault::PowerSequencerPGOODFault;
265
266 report<power_error::PowerSequencerPGOODFault>(
267 metadata::INPUT_NUM(gpiNum),
268 metadata::INPUT_NAME(gpiName.c_str()),
269 metadata::RAW_STATUS(nv.get().c_str()));
270
271 setPGOODFaultLogged(gpiNum);
272 errorCreated = true;
273 }
274 }
275
276 return errorCreated;
277 }
278
createPowerFaultLog()279 void UCD90160::createPowerFaultLog()
280 {
281 util::NamesValues nv;
282
283 try
284 {
285 nv.add("STATUS_WORD", readStatusWord());
286 nv.add("MFR_STATUS", readMFRStatus());
287 }
288 catch (const device_error::ReadFailure& e)
289 {
290 log<level::ERR>("ReadFailure when collecting metadata");
291 commit<device_error::ReadFailure>();
292 }
293
294 using metadata = org::open_power::Witherspoon::Fault::PowerSequencerFault;
295
296 report<power_error::PowerSequencerFault>(
297 metadata::RAW_STATUS(nv.get().c_str()));
298 }
299
/**
 * @brief Finds the /dev GPIO device that belongs to a driver directory.
 *
 * The directory is searched for an entry whose name contains "gpiochip";
 * the first match is turned into /dev/<entry name>. If the path is not a
 * directory or nothing matches, an error is traced.
 *
 * @param[in] path - the directory to search
 *
 * @return the /dev/gpiochipX style path, or an empty path if none found
 */
fs::path UCD90160::findGPIODevice(const fs::path& path)
{
    fs::path devicePath;

    if (fs::is_directory(path))
    {
        for (auto& dirEntry : fs::directory_iterator(path))
        {
            const auto name = dirEntry.path().filename();

            if (name.string().find("gpiochip") == std::string::npos)
            {
                continue;
            }

            // Found gpiochipX, so the device is /dev/gpiochipX
            devicePath = "/dev" / name;
            break;
        }
    }

    if (devicePath.empty())
    {
        log<level::ERR>("Could not find GPIO device path",
                        entry("BASE_PATH=%s", path.c_str()));
    }

    return devicePath;
}
328
doExtraAnalysis(const ucd90160::GPIConfig & config)329 bool UCD90160::doExtraAnalysis(const ucd90160::GPIConfig& config)
330 {
331 auto type = std::get<ucd90160::extraAnalysisField>(config);
332 if (type == ucd90160::extraAnalysisType::none)
333 {
334 return false;
335 }
336
337 // Currently the only extra analysis to do is to check other GPIOs.
338 return doGPIOAnalysis(type);
339 }
340
doGPIOAnalysis(ucd90160::extraAnalysisType type)341 bool UCD90160::doGPIOAnalysis(ucd90160::extraAnalysisType type)
342 {
343 bool errorFound = false;
344 bool shutdown = false;
345
346 const auto& analysisConfig = std::get<ucd90160::gpioAnalysisField>(
347 deviceMap.find(getInstance())->second);
348
349 auto gpioConfig = analysisConfig.find(type);
350 if (gpioConfig == analysisConfig.end())
351 {
352 return errorFound;
353 }
354
355 auto path = std::get<ucd90160::gpioDevicePathField>(gpioConfig->second);
356
357 // The /dev/gpiochipX device
358 auto device = findGPIODevice(path);
359
360 if (device.empty())
361 {
362 log<level::ERR>(
363 "Missing GPIO device - cannot do GPIO analysis of fault",
364 entry("ANALYSIS_TYPE=%d\n", type));
365 return errorFound;
366 }
367
368 // The GPIO value of the fault condition
369 auto polarity = std::get<ucd90160::gpioPolarityField>(gpioConfig->second);
370
371 // The GPIOs to check
372 auto& gpios = std::get<ucd90160::gpioDefinitionField>(gpioConfig->second);
373
374 for (const auto& gpio : gpios)
375 {
376 gpio::Value value;
377
378 try
379 {
380 GPIO g{device, std::get<ucd90160::gpioNumField>(gpio),
381 Direction::input};
382
383 value = g.read();
384 }
385 catch (const std::exception& e)
386 {
387 if (!gpioAccessError)
388 {
389 // GPIO only throws InternalErrors - not worth committing.
390 log<level::ERR>(
391 "GPIO read failed while analyzing a power fault",
392 entry("CHIP_PATH=%s", path.c_str()));
393
394 gpioAccessError = true;
395 }
396 continue;
397 }
398
399 if (value == polarity)
400 {
401 errorFound = true;
402
403 std::string part{INVENTORY_OBJ_PATH};
404 part = part + std::get<ucd90160::gpioCalloutField>(gpio);
405 PartCallout callout{type, part};
406
407 if (isPartCalledOut(callout))
408 {
409 continue;
410 }
411
412 // Look up and call the error creation function
413 auto logError =
414 std::get<ucd90160::errorFunctionField>(gpioConfig->second);
415
416 logError(*this, part);
417
418 // Save the part callout so we don't call it out again
419 setPartCallout(callout);
420
421 // Some errors (like overtemps) require a shutdown
422 auto actions = static_cast<uint32_t>(
423 std::get<ucd90160::optionFlagsField>(gpioConfig->second));
424
425 if (actions & static_cast<decltype(actions)>(
426 ucd90160::optionFlags::shutdownOnFault))
427 {
428 shutdown = true;
429 }
430 }
431 }
432
433 if (shutdown)
434 {
435 // Will be replaced with a GPU specific error in a future commit
436 util::powerOff<power_error::Shutdown>(bus);
437 }
438
439 return errorFound;
440 }
441
gpuPGOODError(const std::string & callout)442 void UCD90160::gpuPGOODError(const std::string& callout)
443 {
444 util::NamesValues nv;
445
446 try
447 {
448 nv.add("STATUS_WORD", readStatusWord());
449 nv.add("MFR_STATUS", readMFRStatus());
450 }
451 catch (const device_error::ReadFailure& e)
452 {
453 log<level::ERR>("ReadFailure when collecting metadata");
454 commit<device_error::ReadFailure>();
455 }
456
457 using metadata = org::open_power::Witherspoon::Fault::GPUPowerFault;
458
459 report<power_error::GPUPowerFault>(
460 metadata::RAW_STATUS(nv.get().c_str()),
461 metadata::CALLOUT_INVENTORY_PATH(callout.c_str()));
462 }
463
gpuOverTempError(const std::string & callout)464 void UCD90160::gpuOverTempError(const std::string& callout)
465 {
466 util::NamesValues nv;
467
468 try
469 {
470 nv.add("STATUS_WORD", readStatusWord());
471 nv.add("MFR_STATUS", readMFRStatus());
472 }
473 catch (const device_error::ReadFailure& e)
474 {
475 log<level::ERR>("ReadFailure when collecting metadata");
476 commit<device_error::ReadFailure>();
477 }
478
479 using metadata = org::open_power::Witherspoon::Fault::GPUOverTemp;
480
481 report<power_error::GPUOverTemp>(
482 metadata::RAW_STATUS(nv.get().c_str()),
483 metadata::CALLOUT_INVENTORY_PATH(callout.c_str()));
484 }
485
memGoodError(const std::string & callout)486 void UCD90160::memGoodError(const std::string& callout)
487 {
488 util::NamesValues nv;
489
490 try
491 {
492 nv.add("STATUS_WORD", readStatusWord());
493 nv.add("MFR_STATUS", readMFRStatus());
494 }
495 catch (const device_error::ReadFailure& e)
496 {
497 log<level::ERR>("ReadFailure when collecting metadata");
498 commit<device_error::ReadFailure>();
499 }
500
501 using metadata = org::open_power::Witherspoon::Fault::MemoryPowerFault;
502
503 report<power_error::MemoryPowerFault>(
504 metadata::RAW_STATUS(nv.get().c_str()),
505 metadata::CALLOUT_INVENTORY_PATH(callout.c_str()));
506 }
507
508 } // namespace power
509 } // namespace phosphor
510