1 /**
2  * Copyright © 2017 IBM Corporation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "ucd90160.hpp"
17 
18 #include "names_values.hpp"
19 #include "utility.hpp"
20 
21 #include <elog-errors.hpp>
22 #include <org/open_power/Witherspoon/Fault/error.hpp>
23 #include <phosphor-logging/elog.hpp>
24 #include <phosphor-logging/log.hpp>
25 #include <xyz/openbmc_project/Common/Device/error.hpp>
26 
27 #include <map>
28 #include <memory>
29 
30 namespace phosphor
31 {
32 namespace power
33 {
34 
35 using namespace std::string_literals;
36 
// Name handed to interface.read() when reading the MFR_STATUS value
const auto MFR_STATUS = "mfr_status"s;

// Name passed to the Device base class constructor
const auto DEVICE_NAME = "UCD90160"s;
// Driver name passed to the `interface` member's constructor
const auto DRIVER_NAME = "ucd9000"s;
// Number of pages whose STATUS_VOUT is examined in checkVOUTFaults()
constexpr auto NUM_PAGES = 16;

// Prefix prepended to a GPIO's callout to build the path that is called out
constexpr auto INVENTORY_OBJ_PATH = "/xyz/openbmc_project/inventory";
44 
45 namespace fs = std::filesystem;
46 using namespace gpio;
47 using namespace pmbus;
48 using namespace phosphor::logging;
49 
50 namespace device_error = sdbusplus::xyz::openbmc_project::Common::Device::Error;
51 namespace power_error = sdbusplus::org::open_power::Witherspoon::Fault::Error;
52 
53 UCD90160::UCD90160(size_t instance, sdbusplus::bus::bus& bus) :
54     Device(DEVICE_NAME, instance),
55     interface(std::get<ucd90160::pathField>(deviceMap.find(instance)->second),
56               DRIVER_NAME, instance),
57     gpioDevice(findGPIODevice(interface.path())), bus(bus)
58 {
59 }
60 
61 void UCD90160::onFailure()
62 {
63     try
64     {
65         auto voutError = checkVOUTFaults();
66 
67         auto pgoodError = checkPGOODFaults(false);
68 
69         // Not a voltage or PGOOD fault, but we know something
70         // failed so still create an error log.
71         if (!voutError && !pgoodError)
72         {
73             createPowerFaultLog();
74         }
75     }
76     catch (device_error::ReadFailure& e)
77     {
78         if (!accessError)
79         {
80             commit<device_error::ReadFailure>();
81             accessError = true;
82         }
83     }
84 }
85 
86 void UCD90160::analyze()
87 {
88     try
89     {
90         // Note: Voltage faults are always fatal, so they just
91         // need to be analyzed in onFailure().
92 
93         checkPGOODFaults(true);
94     }
95     catch (device_error::ReadFailure& e)
96     {
97         if (!accessError)
98         {
99             commit<device_error::ReadFailure>();
100             accessError = true;
101         }
102     }
103 }
104 
105 uint16_t UCD90160::readStatusWord()
106 {
107     return interface.read(STATUS_WORD, Type::Debug);
108 }
109 
110 uint32_t UCD90160::readMFRStatus()
111 {
112     return interface.read(MFR_STATUS, Type::HwmonDeviceDebug);
113 }
114 
115 bool UCD90160::checkVOUTFaults()
116 {
117     bool errorCreated = false;
118     auto statusWord = readStatusWord();
119 
120     // The status_word register has a summary bit to tell us
121     // if each page even needs to be checked
122     if (!(statusWord & status_word::VOUT_FAULT))
123     {
124         return errorCreated;
125     }
126 
127     for (size_t page = 0; page < NUM_PAGES; page++)
128     {
129         if (isVoutFaultLogged(page))
130         {
131             continue;
132         }
133 
134         auto statusVout = interface.insertPageNum(STATUS_VOUT, page);
135         uint8_t vout = interface.read(statusVout, Type::Debug);
136 
137         // If any bits are on log them, though some are just
138         // warnings so they won't cause errors
139         if (vout)
140         {
141             log<level::INFO>("A voltage rail has bits on in STATUS_VOUT",
142                              entry("STATUS_VOUT=0x%X", vout),
143                              entry("PAGE=%d", page));
144         }
145 
146         // Log errors if any non-warning bits on
147         if (vout & ~status_vout::WARNING_MASK)
148         {
149             auto& railNames = std::get<ucd90160::railNamesField>(
150                 deviceMap.find(getInstance())->second);
151             auto railName = railNames.at(page);
152 
153             util::NamesValues nv;
154             try
155             {
156                 nv.add("STATUS_WORD", statusWord);
157                 nv.add("STATUS_VOUT", vout);
158                 nv.add("MFR_STATUS", readMFRStatus());
159             }
160             catch (device_error::ReadFailure& e)
161             {
162                 log<level::ERR>("ReadFailure when collecting metadata");
163                 commit<device_error::ReadFailure>();
164             }
165 
166             using metadata =
167                 org::open_power::Witherspoon::Fault::PowerSequencerVoltageFault;
168 
169             report<power_error::PowerSequencerVoltageFault>(
170                 metadata::RAIL(page), metadata::RAIL_NAME(railName.c_str()),
171                 metadata::RAW_STATUS(nv.get().c_str()));
172 
173             setVoutFaultLogged(page);
174             errorCreated = true;
175         }
176     }
177 
178     return errorCreated;
179 }
180 
181 bool UCD90160::checkPGOODFaults(bool polling)
182 {
183     bool errorCreated = false;
184 
185     // While PGOOD faults could show up in MFR_STATUS (and we could then
186     // check the summary bit in STATUS_WORD first), they are edge triggered,
187     // and as the device driver sends a clear faults command every time we
188     // do a read, we will never see them.  So, we'll have to just read the
189     // real time GPI status GPIO.
190 
191     // Check only the GPIs configured on this system.
192     auto& gpiConfigs = std::get<ucd90160::gpiConfigField>(
193         deviceMap.find(getInstance())->second);
194 
195     for (const auto& gpiConfig : gpiConfigs)
196     {
197         auto gpiNum = std::get<ucd90160::gpiNumField>(gpiConfig);
198         auto doPoll = std::get<ucd90160::pollField>(gpiConfig);
199 
200         // Can skip this one if there is already an error on this input,
201         // or we are polling and these inputs don't need to be polled
202         //(because errors on them are fatal).
203         if (isPGOODFaultLogged(gpiNum) || (polling && !doPoll))
204         {
205             continue;
206         }
207 
208         // The real time status is read via the pin ID
209         auto pinID = std::get<ucd90160::pinIDField>(gpiConfig);
210         auto gpio = gpios.find(pinID);
211         Value gpiStatus;
212 
213         try
214         {
215             // The first time through, create the GPIO objects
216             if (gpio == gpios.end())
217             {
218                 gpios.emplace(pinID, std::make_unique<GPIO>(gpioDevice, pinID,
219                                                             Direction::input));
220                 gpio = gpios.find(pinID);
221             }
222 
223             gpiStatus = gpio->second->read();
224         }
225         catch (std::exception& e)
226         {
227             if (!accessError)
228             {
229                 log<level::ERR>(e.what());
230                 accessError = true;
231             }
232             continue;
233         }
234 
235         if (gpiStatus == Value::low)
236         {
237             // There may be some extra analysis we can do to narrow the
238             // error down further.  Note that finding an error here won't
239             // prevent us from checking this GPI again.
240             errorCreated = doExtraAnalysis(gpiConfig);
241 
242             if (errorCreated)
243             {
244                 continue;
245             }
246 
247             auto& gpiName = std::get<ucd90160::gpiNameField>(gpiConfig);
248             auto status = (gpiStatus == Value::low) ? 0 : 1;
249 
250             util::NamesValues nv;
251 
252             try
253             {
254                 nv.add("STATUS_WORD", readStatusWord());
255                 nv.add("MFR_STATUS", readMFRStatus());
256                 nv.add("INPUT_STATUS", status);
257             }
258             catch (device_error::ReadFailure& e)
259             {
260                 log<level::ERR>("ReadFailure when collecting metadata");
261                 commit<device_error::ReadFailure>();
262             }
263 
264             using metadata =
265                 org::open_power::Witherspoon::Fault::PowerSequencerPGOODFault;
266 
267             report<power_error::PowerSequencerPGOODFault>(
268                 metadata::INPUT_NUM(gpiNum),
269                 metadata::INPUT_NAME(gpiName.c_str()),
270                 metadata::RAW_STATUS(nv.get().c_str()));
271 
272             setPGOODFaultLogged(gpiNum);
273             errorCreated = true;
274         }
275     }
276 
277     return errorCreated;
278 }
279 
280 void UCD90160::createPowerFaultLog()
281 {
282     util::NamesValues nv;
283 
284     try
285     {
286         nv.add("STATUS_WORD", readStatusWord());
287         nv.add("MFR_STATUS", readMFRStatus());
288     }
289     catch (device_error::ReadFailure& e)
290     {
291         log<level::ERR>("ReadFailure when collecting metadata");
292         commit<device_error::ReadFailure>();
293     }
294 
295     using metadata = org::open_power::Witherspoon::Fault::PowerSequencerFault;
296 
297     report<power_error::PowerSequencerFault>(
298         metadata::RAW_STATUS(nv.get().c_str()));
299 }
300 
301 fs::path UCD90160::findGPIODevice(const fs::path& path)
302 {
303     fs::path gpioDevicePath;
304 
305     // In the driver directory, look for a subdirectory
306     // named gpiochipX, where X is some number.  Then
307     // we'll access the GPIO at /dev/gpiochipX.
308     if (fs::is_directory(path))
309     {
310         for (auto& f : fs::directory_iterator(path))
311         {
312             if (f.path().filename().string().find("gpiochip") !=
313                 std::string::npos)
314             {
315                 gpioDevicePath = "/dev" / f.path().filename();
316                 break;
317             }
318         }
319     }
320 
321     if (gpioDevicePath.empty())
322     {
323         log<level::ERR>("Could not find GPIO device path",
324                         entry("BASE_PATH=%s", path.c_str()));
325     }
326 
327     return gpioDevicePath;
328 }
329 
330 bool UCD90160::doExtraAnalysis(const ucd90160::GPIConfig& config)
331 {
332 
333     auto type = std::get<ucd90160::extraAnalysisField>(config);
334     if (type == ucd90160::extraAnalysisType::none)
335     {
336         return false;
337     }
338 
339     // Currently the only extra analysis to do is to check other GPIOs.
340     return doGPIOAnalysis(type);
341 }
342 
343 bool UCD90160::doGPIOAnalysis(ucd90160::extraAnalysisType type)
344 {
345     bool errorFound = false;
346     bool shutdown = false;
347 
348     const auto& analysisConfig = std::get<ucd90160::gpioAnalysisField>(
349         deviceMap.find(getInstance())->second);
350 
351     auto gpioConfig = analysisConfig.find(type);
352     if (gpioConfig == analysisConfig.end())
353     {
354         return errorFound;
355     }
356 
357     auto path = std::get<ucd90160::gpioDevicePathField>(gpioConfig->second);
358 
359     // The /dev/gpiochipX device
360     auto device = findGPIODevice(path);
361 
362     if (device.empty())
363     {
364         log<level::ERR>(
365             "Missing GPIO device - cannot do GPIO analysis of fault",
366             entry("ANALYSIS_TYPE=%d\n", type));
367         return errorFound;
368     }
369 
370     // The GPIO value of the fault condition
371     auto polarity = std::get<ucd90160::gpioPolarityField>(gpioConfig->second);
372 
373     // The GPIOs to check
374     auto& gpios = std::get<ucd90160::gpioDefinitionField>(gpioConfig->second);
375 
376     for (const auto& gpio : gpios)
377     {
378         gpio::Value value;
379 
380         try
381         {
382             GPIO g{device, std::get<ucd90160::gpioNumField>(gpio),
383                    Direction::input};
384 
385             value = g.read();
386         }
387         catch (std::exception& e)
388         {
389             if (!gpioAccessError)
390             {
391                 // GPIO only throws InternalErrors - not worth committing.
392                 log<level::ERR>(
393                     "GPIO read failed while analyzing a power fault",
394                     entry("CHIP_PATH=%s", path.c_str()));
395 
396                 gpioAccessError = true;
397             }
398             continue;
399         }
400 
401         if (value == polarity)
402         {
403             errorFound = true;
404 
405             std::string part{INVENTORY_OBJ_PATH};
406             part = part + std::get<ucd90160::gpioCalloutField>(gpio);
407             PartCallout callout{type, part};
408 
409             if (isPartCalledOut(callout))
410             {
411                 continue;
412             }
413 
414             // Look up and call the error creation function
415             auto logError =
416                 std::get<ucd90160::errorFunctionField>(gpioConfig->second);
417 
418             logError(*this, part);
419 
420             // Save the part callout so we don't call it out again
421             setPartCallout(callout);
422 
423             // Some errors (like overtemps) require a shutdown
424             auto actions = static_cast<uint32_t>(
425                 std::get<ucd90160::optionFlagsField>(gpioConfig->second));
426 
427             if (actions & static_cast<decltype(actions)>(
428                               ucd90160::optionFlags::shutdownOnFault))
429             {
430                 shutdown = true;
431             }
432         }
433     }
434 
435     if (shutdown)
436     {
437         // Will be replaced with a GPU specific error in a future commit
438         util::powerOff<power_error::Shutdown>(bus);
439     }
440 
441     return errorFound;
442 }
443 
444 void UCD90160::gpuPGOODError(const std::string& callout)
445 {
446     util::NamesValues nv;
447 
448     try
449     {
450         nv.add("STATUS_WORD", readStatusWord());
451         nv.add("MFR_STATUS", readMFRStatus());
452     }
453     catch (device_error::ReadFailure& e)
454     {
455         log<level::ERR>("ReadFailure when collecting metadata");
456         commit<device_error::ReadFailure>();
457     }
458 
459     using metadata = org::open_power::Witherspoon::Fault::GPUPowerFault;
460 
461     report<power_error::GPUPowerFault>(
462         metadata::RAW_STATUS(nv.get().c_str()),
463         metadata::CALLOUT_INVENTORY_PATH(callout.c_str()));
464 }
465 
466 void UCD90160::gpuOverTempError(const std::string& callout)
467 {
468     util::NamesValues nv;
469 
470     try
471     {
472         nv.add("STATUS_WORD", readStatusWord());
473         nv.add("MFR_STATUS", readMFRStatus());
474     }
475     catch (device_error::ReadFailure& e)
476     {
477         log<level::ERR>("ReadFailure when collecting metadata");
478         commit<device_error::ReadFailure>();
479     }
480 
481     using metadata = org::open_power::Witherspoon::Fault::GPUOverTemp;
482 
483     report<power_error::GPUOverTemp>(
484         metadata::RAW_STATUS(nv.get().c_str()),
485         metadata::CALLOUT_INVENTORY_PATH(callout.c_str()));
486 }
487 
488 void UCD90160::memGoodError(const std::string& callout)
489 {
490     util::NamesValues nv;
491 
492     try
493     {
494         nv.add("STATUS_WORD", readStatusWord());
495         nv.add("MFR_STATUS", readMFRStatus());
496     }
497     catch (device_error::ReadFailure& e)
498     {
499         log<level::ERR>("ReadFailure when collecting metadata");
500         commit<device_error::ReadFailure>();
501     }
502 
503     using metadata = org::open_power::Witherspoon::Fault::MemoryPowerFault;
504 
505     report<power_error::MemoryPowerFault>(
506         metadata::RAW_STATUS(nv.get().c_str()),
507         metadata::CALLOUT_INVENTORY_PATH(callout.c_str()));
508 }
509 
510 } // namespace power
511 } // namespace phosphor
512