1 /**
2  * Copyright © 2017 IBM Corporation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "ucd90160.hpp"
17 
18 #include "names_values.hpp"
19 #include "utility.hpp"
20 
21 #include <elog-errors.hpp>
22 #include <map>
23 #include <memory>
24 #include <org/open_power/Witherspoon/Fault/error.hpp>
25 #include <phosphor-logging/elog.hpp>
26 #include <phosphor-logging/log.hpp>
27 #include <xyz/openbmc_project/Common/Device/error.hpp>
28 
29 namespace witherspoon
30 {
31 namespace power
32 {
33 
using namespace std::string_literals;

// Name of the file read by readMFRStatus() to get the MFR_STATUS value.
const std::string MFR_STATUS{"mfr_status"};

// Names handed to the Device base class and to the PMBus interface
// when the UCD90160 object is constructed.
const std::string DEVICE_NAME{"UCD90160"};
const std::string DRIVER_NAME{"ucd9000"};

// Number of pages that checkVOUTFaults() walks through.
constexpr int NUM_PAGES{16};

// Prefix prepended to a GPIO's callout to form the part's inventory path.
constexpr const char* INVENTORY_OBJ_PATH{"/xyz/openbmc_project/inventory"};
43 
44 namespace fs = std::experimental::filesystem;
45 using namespace gpio;
46 using namespace pmbus;
47 using namespace phosphor::logging;
48 
49 namespace device_error = sdbusplus::xyz::openbmc_project::Common::Device::Error;
50 namespace power_error = sdbusplus::org::open_power::Witherspoon::Fault::Error;
51 
52 UCD90160::UCD90160(size_t instance, sdbusplus::bus::bus& bus) :
53     Device(DEVICE_NAME, instance),
54     interface(std::get<ucd90160::pathField>(deviceMap.find(instance)->second),
55               DRIVER_NAME, instance),
56     gpioDevice(findGPIODevice(interface.path())), bus(bus)
57 {
58 }
59 
60 void UCD90160::onFailure()
61 {
62     try
63     {
64         auto voutError = checkVOUTFaults();
65 
66         auto pgoodError = checkPGOODFaults(false);
67 
68         // Not a voltage or PGOOD fault, but we know something
69         // failed so still create an error log.
70         if (!voutError && !pgoodError)
71         {
72             createPowerFaultLog();
73         }
74     }
75     catch (device_error::ReadFailure& e)
76     {
77         if (!accessError)
78         {
79             commit<device_error::ReadFailure>();
80             accessError = true;
81         }
82     }
83 }
84 
85 void UCD90160::analyze()
86 {
87     try
88     {
89         // Note: Voltage faults are always fatal, so they just
90         // need to be analyzed in onFailure().
91 
92         checkPGOODFaults(true);
93     }
94     catch (device_error::ReadFailure& e)
95     {
96         if (!accessError)
97         {
98             commit<device_error::ReadFailure>();
99             accessError = true;
100         }
101     }
102 }
103 
104 uint16_t UCD90160::readStatusWord()
105 {
106     return interface.read(STATUS_WORD, Type::Debug);
107 }
108 
109 uint32_t UCD90160::readMFRStatus()
110 {
111     return interface.read(MFR_STATUS, Type::HwmonDeviceDebug);
112 }
113 
114 bool UCD90160::checkVOUTFaults()
115 {
116     bool errorCreated = false;
117     auto statusWord = readStatusWord();
118 
119     // The status_word register has a summary bit to tell us
120     // if each page even needs to be checked
121     if (!(statusWord & status_word::VOUT_FAULT))
122     {
123         return errorCreated;
124     }
125 
126     for (size_t page = 0; page < NUM_PAGES; page++)
127     {
128         if (isVoutFaultLogged(page))
129         {
130             continue;
131         }
132 
133         auto statusVout = interface.insertPageNum(STATUS_VOUT, page);
134         uint8_t vout = interface.read(statusVout, Type::Debug);
135 
136         // If any bits are on log them, though some are just
137         // warnings so they won't cause errors
138         if (vout)
139         {
140             log<level::INFO>("A voltage rail has bits on in STATUS_VOUT",
141                              entry("STATUS_VOUT=0x%X", vout),
142                              entry("PAGE=%d", page));
143         }
144 
145         // Log errors if any non-warning bits on
146         if (vout & ~status_vout::WARNING_MASK)
147         {
148             auto& railNames = std::get<ucd90160::railNamesField>(
149                 deviceMap.find(getInstance())->second);
150             auto railName = railNames.at(page);
151 
152             util::NamesValues nv;
153             try
154             {
155                 nv.add("STATUS_WORD", statusWord);
156                 nv.add("STATUS_VOUT", vout);
157                 nv.add("MFR_STATUS", readMFRStatus());
158             }
159             catch (device_error::ReadFailure& e)
160             {
161                 log<level::ERR>("ReadFailure when collecting metadata");
162                 commit<device_error::ReadFailure>();
163             }
164 
165             using metadata =
166                 org::open_power::Witherspoon::Fault::PowerSequencerVoltageFault;
167 
168             report<power_error::PowerSequencerVoltageFault>(
169                 metadata::RAIL(page), metadata::RAIL_NAME(railName.c_str()),
170                 metadata::RAW_STATUS(nv.get().c_str()));
171 
172             setVoutFaultLogged(page);
173             errorCreated = true;
174         }
175     }
176 
177     return errorCreated;
178 }
179 
180 bool UCD90160::checkPGOODFaults(bool polling)
181 {
182     bool errorCreated = false;
183 
184     // While PGOOD faults could show up in MFR_STATUS (and we could then
185     // check the summary bit in STATUS_WORD first), they are edge triggered,
186     // and as the device driver sends a clear faults command every time we
187     // do a read, we will never see them.  So, we'll have to just read the
188     // real time GPI status GPIO.
189 
190     // Check only the GPIs configured on this system.
191     auto& gpiConfigs = std::get<ucd90160::gpiConfigField>(
192         deviceMap.find(getInstance())->second);
193 
194     for (const auto& gpiConfig : gpiConfigs)
195     {
196         auto gpiNum = std::get<ucd90160::gpiNumField>(gpiConfig);
197         auto doPoll = std::get<ucd90160::pollField>(gpiConfig);
198 
199         // Can skip this one if there is already an error on this input,
200         // or we are polling and these inputs don't need to be polled
201         //(because errors on them are fatal).
202         if (isPGOODFaultLogged(gpiNum) || (polling && !doPoll))
203         {
204             continue;
205         }
206 
207         // The real time status is read via the pin ID
208         auto pinID = std::get<ucd90160::pinIDField>(gpiConfig);
209         auto gpio = gpios.find(pinID);
210         Value gpiStatus;
211 
212         try
213         {
214             // The first time through, create the GPIO objects
215             if (gpio == gpios.end())
216             {
217                 gpios.emplace(pinID, std::make_unique<GPIO>(gpioDevice, pinID,
218                                                             Direction::input));
219                 gpio = gpios.find(pinID);
220             }
221 
222             gpiStatus = gpio->second->read();
223         }
224         catch (std::exception& e)
225         {
226             if (!accessError)
227             {
228                 log<level::ERR>(e.what());
229                 accessError = true;
230             }
231             continue;
232         }
233 
234         if (gpiStatus == Value::low)
235         {
236             // There may be some extra analysis we can do to narrow the
237             // error down further.  Note that finding an error here won't
238             // prevent us from checking this GPI again.
239             errorCreated = doExtraAnalysis(gpiConfig);
240 
241             if (errorCreated)
242             {
243                 continue;
244             }
245 
246             auto& gpiName = std::get<ucd90160::gpiNameField>(gpiConfig);
247             auto status = (gpiStatus == Value::low) ? 0 : 1;
248 
249             util::NamesValues nv;
250 
251             try
252             {
253                 nv.add("STATUS_WORD", readStatusWord());
254                 nv.add("MFR_STATUS", readMFRStatus());
255                 nv.add("INPUT_STATUS", status);
256             }
257             catch (device_error::ReadFailure& e)
258             {
259                 log<level::ERR>("ReadFailure when collecting metadata");
260                 commit<device_error::ReadFailure>();
261             }
262 
263             using metadata =
264                 org::open_power::Witherspoon::Fault::PowerSequencerPGOODFault;
265 
266             report<power_error::PowerSequencerPGOODFault>(
267                 metadata::INPUT_NUM(gpiNum),
268                 metadata::INPUT_NAME(gpiName.c_str()),
269                 metadata::RAW_STATUS(nv.get().c_str()));
270 
271             setPGOODFaultLogged(gpiNum);
272             errorCreated = true;
273         }
274     }
275 
276     return errorCreated;
277 }
278 
279 void UCD90160::createPowerFaultLog()
280 {
281     util::NamesValues nv;
282 
283     try
284     {
285         nv.add("STATUS_WORD", readStatusWord());
286         nv.add("MFR_STATUS", readMFRStatus());
287     }
288     catch (device_error::ReadFailure& e)
289     {
290         log<level::ERR>("ReadFailure when collecting metadata");
291         commit<device_error::ReadFailure>();
292     }
293 
294     using metadata = org::open_power::Witherspoon::Fault::PowerSequencerFault;
295 
296     report<power_error::PowerSequencerFault>(
297         metadata::RAW_STATUS(nv.get().c_str()));
298 }
299 
300 fs::path UCD90160::findGPIODevice(const fs::path& path)
301 {
302     fs::path gpioDevicePath;
303 
304     // In the driver directory, look for a subdirectory
305     // named gpiochipX, where X is some number.  Then
306     // we'll access the GPIO at /dev/gpiochipX.
307     if (fs::is_directory(path))
308     {
309         for (auto& f : fs::directory_iterator(path))
310         {
311             if (f.path().filename().string().find("gpiochip") !=
312                 std::string::npos)
313             {
314                 gpioDevicePath = "/dev" / f.path().filename();
315                 break;
316             }
317         }
318     }
319 
320     if (gpioDevicePath.empty())
321     {
322         log<level::ERR>("Could not find GPIO device path",
323                         entry("BASE_PATH=%s", path.c_str()));
324     }
325 
326     return gpioDevicePath;
327 }
328 
329 bool UCD90160::doExtraAnalysis(const ucd90160::GPIConfig& config)
330 {
331 
332     auto type = std::get<ucd90160::extraAnalysisField>(config);
333     if (type == ucd90160::extraAnalysisType::none)
334     {
335         return false;
336     }
337 
338     // Currently the only extra analysis to do is to check other GPIOs.
339     return doGPIOAnalysis(type);
340 }
341 
342 bool UCD90160::doGPIOAnalysis(ucd90160::extraAnalysisType type)
343 {
344     bool errorFound = false;
345     bool shutdown = false;
346 
347     const auto& analysisConfig = std::get<ucd90160::gpioAnalysisField>(
348         deviceMap.find(getInstance())->second);
349 
350     auto gpioConfig = analysisConfig.find(type);
351     if (gpioConfig == analysisConfig.end())
352     {
353         return errorFound;
354     }
355 
356     auto path = std::get<ucd90160::gpioDevicePathField>(gpioConfig->second);
357 
358     // The /dev/gpiochipX device
359     auto device = findGPIODevice(path);
360 
361     // The GPIO value of the fault condition
362     auto polarity = std::get<ucd90160::gpioPolarityField>(gpioConfig->second);
363 
364     // The GPIOs to check
365     auto& gpios = std::get<ucd90160::gpioDefinitionField>(gpioConfig->second);
366 
367     for (const auto& gpio : gpios)
368     {
369         gpio::Value value;
370 
371         try
372         {
373             GPIO g{device, std::get<ucd90160::gpioNumField>(gpio),
374                    Direction::input};
375 
376             value = g.read();
377         }
378         catch (std::exception& e)
379         {
380             if (!gpioAccessError)
381             {
382                 // GPIO only throws InternalErrors - not worth committing.
383                 log<level::ERR>(
384                     "GPIO read failed while analyzing a power fault",
385                     entry("CHIP_PATH=%s", path.c_str()));
386 
387                 gpioAccessError = true;
388             }
389             continue;
390         }
391 
392         if (value == polarity)
393         {
394             errorFound = true;
395 
396             std::string part{INVENTORY_OBJ_PATH};
397             part = part + std::get<ucd90160::gpioCalloutField>(gpio);
398             PartCallout callout{type, part};
399 
400             if (isPartCalledOut(callout))
401             {
402                 continue;
403             }
404 
405             // Look up and call the error creation function
406             auto logError =
407                 std::get<ucd90160::errorFunctionField>(gpioConfig->second);
408 
409             logError(*this, part);
410 
411             // Save the part callout so we don't call it out again
412             setPartCallout(callout);
413 
414             // Some errors (like overtemps) require a shutdown
415             auto actions = static_cast<uint32_t>(
416                 std::get<ucd90160::optionFlagsField>(gpioConfig->second));
417 
418             if (actions & static_cast<decltype(actions)>(
419                               ucd90160::optionFlags::shutdownOnFault))
420             {
421                 shutdown = true;
422             }
423         }
424     }
425 
426     if (shutdown)
427     {
428         // Will be replaced with a GPU specific error in a future commit
429         util::powerOff<power_error::Shutdown>(bus);
430     }
431 
432     return errorFound;
433 }
434 
435 void UCD90160::gpuPGOODError(const std::string& callout)
436 {
437     util::NamesValues nv;
438 
439     try
440     {
441         nv.add("STATUS_WORD", readStatusWord());
442         nv.add("MFR_STATUS", readMFRStatus());
443     }
444     catch (device_error::ReadFailure& e)
445     {
446         log<level::ERR>("ReadFailure when collecting metadata");
447         commit<device_error::ReadFailure>();
448     }
449 
450     using metadata = org::open_power::Witherspoon::Fault::GPUPowerFault;
451 
452     report<power_error::GPUPowerFault>(
453         metadata::RAW_STATUS(nv.get().c_str()),
454         metadata::CALLOUT_INVENTORY_PATH(callout.c_str()));
455 }
456 
457 void UCD90160::gpuOverTempError(const std::string& callout)
458 {
459     util::NamesValues nv;
460 
461     try
462     {
463         nv.add("STATUS_WORD", readStatusWord());
464         nv.add("MFR_STATUS", readMFRStatus());
465     }
466     catch (device_error::ReadFailure& e)
467     {
468         log<level::ERR>("ReadFailure when collecting metadata");
469         commit<device_error::ReadFailure>();
470     }
471 
472     using metadata = org::open_power::Witherspoon::Fault::GPUOverTemp;
473 
474     report<power_error::GPUOverTemp>(
475         metadata::RAW_STATUS(nv.get().c_str()),
476         metadata::CALLOUT_INVENTORY_PATH(callout.c_str()));
477 }
478 
479 } // namespace power
480 } // namespace witherspoon
481