1 /**
2  * Copyright © 2017 IBM Corporation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "ucd90160.hpp"
17 
18 #include "names_values.hpp"
19 #include "utility.hpp"
20 
21 #include <elog-errors.hpp>
22 #include <org/open_power/Witherspoon/Fault/error.hpp>
23 #include <phosphor-logging/elog.hpp>
24 #include <phosphor-logging/log.hpp>
25 #include <xyz/openbmc_project/Common/Device/error.hpp>
26 
27 #include <map>
28 #include <memory>
29 
30 namespace phosphor
31 {
32 namespace power
33 {
34 
35 using namespace std::string_literals;
36 
// File name handed to interface.read() when fetching MFR_STATUS
const std::string MFR_STATUS{"mfr_status"};

// Names passed to the Device base class and to the interface member
// when this object is constructed
const std::string DEVICE_NAME{"UCD90160"};
const std::string DRIVER_NAME{"ucd9000"};

// Number of pages walked when checking STATUS_VOUT
constexpr int NUM_PAGES{16};

// Prefix prepended to the callout paths used by the GPIO analysis
constexpr const char* INVENTORY_OBJ_PATH{"/xyz/openbmc_project/inventory"};
44 
45 namespace fs = std::filesystem;
46 using namespace gpio;
47 using namespace pmbus;
48 using namespace phosphor::logging;
49 
50 namespace device_error = sdbusplus::xyz::openbmc_project::Common::Device::Error;
51 namespace power_error = sdbusplus::org::open_power::Witherspoon::Fault::Error;
52 
UCD90160(size_t instance,sdbusplus::bus_t & bus)53 UCD90160::UCD90160(size_t instance, sdbusplus::bus_t& bus) :
54     Device(DEVICE_NAME, instance),
55     interface(std::get<ucd90160::pathField>(deviceMap.find(instance)->second),
56               DRIVER_NAME, instance),
57     gpioDevice(findGPIODevice(interface.path())), bus(bus)
58 {}
59 
onFailure()60 void UCD90160::onFailure()
61 {
62     try
63     {
64         auto voutError = checkVOUTFaults();
65 
66         auto pgoodError = checkPGOODFaults(false);
67 
68         // Not a voltage or PGOOD fault, but we know something
69         // failed so still create an error log.
70         if (!voutError && !pgoodError)
71         {
72             createPowerFaultLog();
73         }
74     }
75     catch (const device_error::ReadFailure& e)
76     {
77         if (!accessError)
78         {
79             commit<device_error::ReadFailure>();
80             accessError = true;
81         }
82     }
83 }
84 
analyze()85 void UCD90160::analyze()
86 {
87     try
88     {
89         // Note: Voltage faults are always fatal, so they just
90         // need to be analyzed in onFailure().
91 
92         checkPGOODFaults(true);
93     }
94     catch (const device_error::ReadFailure& e)
95     {
96         if (!accessError)
97         {
98             commit<device_error::ReadFailure>();
99             accessError = true;
100         }
101     }
102 }
103 
readStatusWord()104 uint16_t UCD90160::readStatusWord()
105 {
106     return interface.read(STATUS_WORD, Type::Debug);
107 }
108 
readMFRStatus()109 uint32_t UCD90160::readMFRStatus()
110 {
111     return interface.read(MFR_STATUS, Type::HwmonDeviceDebug);
112 }
113 
checkVOUTFaults()114 bool UCD90160::checkVOUTFaults()
115 {
116     bool errorCreated = false;
117     auto statusWord = readStatusWord();
118 
119     // The status_word register has a summary bit to tell us
120     // if each page even needs to be checked
121     if (!(statusWord & status_word::VOUT_FAULT))
122     {
123         return errorCreated;
124     }
125 
126     for (size_t page = 0; page < NUM_PAGES; page++)
127     {
128         if (isVoutFaultLogged(page))
129         {
130             continue;
131         }
132 
133         auto statusVout = interface.insertPageNum(STATUS_VOUT, page);
134         uint8_t vout = interface.read(statusVout, Type::Debug);
135 
136         // If any bits are on log them, though some are just
137         // warnings so they won't cause errors
138         if (vout)
139         {
140             log<level::INFO>("A voltage rail has bits on in STATUS_VOUT",
141                              entry("STATUS_VOUT=0x%X", vout),
142                              entry("PAGE=%d", page));
143         }
144 
145         // Log errors if any non-warning bits on
146         if (vout & ~status_vout::WARNING_MASK)
147         {
148             auto& railNames = std::get<ucd90160::railNamesField>(
149                 deviceMap.find(getInstance())->second);
150             auto railName = railNames.at(page);
151 
152             util::NamesValues nv;
153             try
154             {
155                 nv.add("STATUS_WORD", statusWord);
156                 nv.add("STATUS_VOUT", vout);
157                 nv.add("MFR_STATUS", readMFRStatus());
158             }
159             catch (const device_error::ReadFailure& e)
160             {
161                 log<level::ERR>("ReadFailure when collecting metadata");
162                 commit<device_error::ReadFailure>();
163             }
164 
165             using metadata =
166                 org::open_power::Witherspoon::Fault::PowerSequencerVoltageFault;
167 
168             report<power_error::PowerSequencerVoltageFault>(
169                 metadata::RAIL(page), metadata::RAIL_NAME(railName.c_str()),
170                 metadata::RAW_STATUS(nv.get().c_str()));
171 
172             setVoutFaultLogged(page);
173             errorCreated = true;
174         }
175     }
176 
177     return errorCreated;
178 }
179 
checkPGOODFaults(bool polling)180 bool UCD90160::checkPGOODFaults(bool polling)
181 {
182     bool errorCreated = false;
183 
184     // While PGOOD faults could show up in MFR_STATUS (and we could then
185     // check the summary bit in STATUS_WORD first), they are edge triggered,
186     // and as the device driver sends a clear faults command every time we
187     // do a read, we will never see them.  So, we'll have to just read the
188     // real time GPI status GPIO.
189 
190     // Check only the GPIs configured on this system.
191     auto& gpiConfigs = std::get<ucd90160::gpiConfigField>(
192         deviceMap.find(getInstance())->second);
193 
194     for (const auto& gpiConfig : gpiConfigs)
195     {
196         auto gpiNum = std::get<ucd90160::gpiNumField>(gpiConfig);
197         auto doPoll = std::get<ucd90160::pollField>(gpiConfig);
198 
199         // Can skip this one if there is already an error on this input,
200         // or we are polling and these inputs don't need to be polled
201         //(because errors on them are fatal).
202         if (isPGOODFaultLogged(gpiNum) || (polling && !doPoll))
203         {
204             continue;
205         }
206 
207         // The real time status is read via the pin ID
208         auto pinID = std::get<ucd90160::pinIDField>(gpiConfig);
209         auto gpio = gpios.find(pinID);
210         Value gpiStatus;
211 
212         try
213         {
214             // The first time through, create the GPIO objects
215             if (gpio == gpios.end())
216             {
217                 gpios.emplace(pinID, std::make_unique<GPIO>(gpioDevice, pinID,
218                                                             Direction::input));
219                 gpio = gpios.find(pinID);
220             }
221 
222             gpiStatus = gpio->second->read();
223         }
224         catch (const std::exception& e)
225         {
226             if (!accessError)
227             {
228                 log<level::ERR>(e.what());
229                 accessError = true;
230             }
231             continue;
232         }
233 
234         if (gpiStatus == Value::low)
235         {
236             // There may be some extra analysis we can do to narrow the
237             // error down further.  Note that finding an error here won't
238             // prevent us from checking this GPI again.
239             errorCreated = doExtraAnalysis(gpiConfig);
240 
241             if (errorCreated)
242             {
243                 continue;
244             }
245 
246             auto& gpiName = std::get<ucd90160::gpiNameField>(gpiConfig);
247             auto status = (gpiStatus == Value::low) ? 0 : 1;
248 
249             util::NamesValues nv;
250 
251             try
252             {
253                 nv.add("STATUS_WORD", readStatusWord());
254                 nv.add("MFR_STATUS", readMFRStatus());
255                 nv.add("INPUT_STATUS", status);
256             }
257             catch (const device_error::ReadFailure& e)
258             {
259                 log<level::ERR>("ReadFailure when collecting metadata");
260                 commit<device_error::ReadFailure>();
261             }
262 
263             using metadata =
264                 org::open_power::Witherspoon::Fault::PowerSequencerPGOODFault;
265 
266             report<power_error::PowerSequencerPGOODFault>(
267                 metadata::INPUT_NUM(gpiNum),
268                 metadata::INPUT_NAME(gpiName.c_str()),
269                 metadata::RAW_STATUS(nv.get().c_str()));
270 
271             setPGOODFaultLogged(gpiNum);
272             errorCreated = true;
273         }
274     }
275 
276     return errorCreated;
277 }
278 
createPowerFaultLog()279 void UCD90160::createPowerFaultLog()
280 {
281     util::NamesValues nv;
282 
283     try
284     {
285         nv.add("STATUS_WORD", readStatusWord());
286         nv.add("MFR_STATUS", readMFRStatus());
287     }
288     catch (const device_error::ReadFailure& e)
289     {
290         log<level::ERR>("ReadFailure when collecting metadata");
291         commit<device_error::ReadFailure>();
292     }
293 
294     using metadata = org::open_power::Witherspoon::Fault::PowerSequencerFault;
295 
296     report<power_error::PowerSequencerFault>(
297         metadata::RAW_STATUS(nv.get().c_str()));
298 }
299 
/**
 * Looks in the given directory for an entry whose name contains
 * "gpiochip" and returns the matching device path under /dev.
 *
 * @param[in] path - the directory to search
 *
 * @return /dev/<entry name> for the first match, or an empty path (after
 *         logging an error) if path is not a directory or nothing matches
 */
fs::path UCD90160::findGPIODevice(const fs::path& path)
{
    // The driver directory should hold a subdirectory named gpiochipX,
    // X being some number; the GPIO is then accessed via /dev/gpiochipX.
    if (fs::is_directory(path))
    {
        for (const auto& dirEntry : fs::directory_iterator(path))
        {
            const auto name = dirEntry.path().filename();

            if (name.string().find("gpiochip") != std::string::npos)
            {
                return "/dev" / name;
            }
        }
    }

    log<level::ERR>("Could not find GPIO device path",
                    entry("BASE_PATH=%s", path.c_str()));

    return fs::path{};
}
328 
doExtraAnalysis(const ucd90160::GPIConfig & config)329 bool UCD90160::doExtraAnalysis(const ucd90160::GPIConfig& config)
330 {
331     auto type = std::get<ucd90160::extraAnalysisField>(config);
332     if (type == ucd90160::extraAnalysisType::none)
333     {
334         return false;
335     }
336 
337     // Currently the only extra analysis to do is to check other GPIOs.
338     return doGPIOAnalysis(type);
339 }
340 
doGPIOAnalysis(ucd90160::extraAnalysisType type)341 bool UCD90160::doGPIOAnalysis(ucd90160::extraAnalysisType type)
342 {
343     bool errorFound = false;
344     bool shutdown = false;
345 
346     const auto& analysisConfig = std::get<ucd90160::gpioAnalysisField>(
347         deviceMap.find(getInstance())->second);
348 
349     auto gpioConfig = analysisConfig.find(type);
350     if (gpioConfig == analysisConfig.end())
351     {
352         return errorFound;
353     }
354 
355     auto path = std::get<ucd90160::gpioDevicePathField>(gpioConfig->second);
356 
357     // The /dev/gpiochipX device
358     auto device = findGPIODevice(path);
359 
360     if (device.empty())
361     {
362         log<level::ERR>(
363             "Missing GPIO device - cannot do GPIO analysis of fault",
364             entry("ANALYSIS_TYPE=%d\n", type));
365         return errorFound;
366     }
367 
368     // The GPIO value of the fault condition
369     auto polarity = std::get<ucd90160::gpioPolarityField>(gpioConfig->second);
370 
371     // The GPIOs to check
372     auto& gpios = std::get<ucd90160::gpioDefinitionField>(gpioConfig->second);
373 
374     for (const auto& gpio : gpios)
375     {
376         gpio::Value value;
377 
378         try
379         {
380             GPIO g{device, std::get<ucd90160::gpioNumField>(gpio),
381                    Direction::input};
382 
383             value = g.read();
384         }
385         catch (const std::exception& e)
386         {
387             if (!gpioAccessError)
388             {
389                 // GPIO only throws InternalErrors - not worth committing.
390                 log<level::ERR>(
391                     "GPIO read failed while analyzing a power fault",
392                     entry("CHIP_PATH=%s", path.c_str()));
393 
394                 gpioAccessError = true;
395             }
396             continue;
397         }
398 
399         if (value == polarity)
400         {
401             errorFound = true;
402 
403             std::string part{INVENTORY_OBJ_PATH};
404             part = part + std::get<ucd90160::gpioCalloutField>(gpio);
405             PartCallout callout{type, part};
406 
407             if (isPartCalledOut(callout))
408             {
409                 continue;
410             }
411 
412             // Look up and call the error creation function
413             auto logError =
414                 std::get<ucd90160::errorFunctionField>(gpioConfig->second);
415 
416             logError(*this, part);
417 
418             // Save the part callout so we don't call it out again
419             setPartCallout(callout);
420 
421             // Some errors (like overtemps) require a shutdown
422             auto actions = static_cast<uint32_t>(
423                 std::get<ucd90160::optionFlagsField>(gpioConfig->second));
424 
425             if (actions & static_cast<decltype(actions)>(
426                               ucd90160::optionFlags::shutdownOnFault))
427             {
428                 shutdown = true;
429             }
430         }
431     }
432 
433     if (shutdown)
434     {
435         // Will be replaced with a GPU specific error in a future commit
436         util::powerOff<power_error::Shutdown>(bus);
437     }
438 
439     return errorFound;
440 }
441 
gpuPGOODError(const std::string & callout)442 void UCD90160::gpuPGOODError(const std::string& callout)
443 {
444     util::NamesValues nv;
445 
446     try
447     {
448         nv.add("STATUS_WORD", readStatusWord());
449         nv.add("MFR_STATUS", readMFRStatus());
450     }
451     catch (const device_error::ReadFailure& e)
452     {
453         log<level::ERR>("ReadFailure when collecting metadata");
454         commit<device_error::ReadFailure>();
455     }
456 
457     using metadata = org::open_power::Witherspoon::Fault::GPUPowerFault;
458 
459     report<power_error::GPUPowerFault>(
460         metadata::RAW_STATUS(nv.get().c_str()),
461         metadata::CALLOUT_INVENTORY_PATH(callout.c_str()));
462 }
463 
gpuOverTempError(const std::string & callout)464 void UCD90160::gpuOverTempError(const std::string& callout)
465 {
466     util::NamesValues nv;
467 
468     try
469     {
470         nv.add("STATUS_WORD", readStatusWord());
471         nv.add("MFR_STATUS", readMFRStatus());
472     }
473     catch (const device_error::ReadFailure& e)
474     {
475         log<level::ERR>("ReadFailure when collecting metadata");
476         commit<device_error::ReadFailure>();
477     }
478 
479     using metadata = org::open_power::Witherspoon::Fault::GPUOverTemp;
480 
481     report<power_error::GPUOverTemp>(
482         metadata::RAW_STATUS(nv.get().c_str()),
483         metadata::CALLOUT_INVENTORY_PATH(callout.c_str()));
484 }
485 
memGoodError(const std::string & callout)486 void UCD90160::memGoodError(const std::string& callout)
487 {
488     util::NamesValues nv;
489 
490     try
491     {
492         nv.add("STATUS_WORD", readStatusWord());
493         nv.add("MFR_STATUS", readMFRStatus());
494     }
495     catch (const device_error::ReadFailure& e)
496     {
497         log<level::ERR>("ReadFailure when collecting metadata");
498         commit<device_error::ReadFailure>();
499     }
500 
501     using metadata = org::open_power::Witherspoon::Fault::MemoryPowerFault;
502 
503     report<power_error::MemoryPowerFault>(
504         metadata::RAW_STATUS(nv.get().c_str()),
505         metadata::CALLOUT_INVENTORY_PATH(callout.c_str()));
506 }
507 
508 } // namespace power
509 } // namespace phosphor
510