1 /**
2  * Copyright © 2017 IBM Corporation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "ucd90160.hpp"
17 
18 #include "names_values.hpp"
19 #include "utility.hpp"
20 
21 #include <elog-errors.hpp>
22 #include <org/open_power/Witherspoon/Fault/error.hpp>
23 #include <phosphor-logging/elog.hpp>
24 #include <phosphor-logging/log.hpp>
25 #include <xyz/openbmc_project/Common/Device/error.hpp>
26 
27 #include <map>
28 #include <memory>
29 
30 namespace phosphor
31 {
32 namespace power
33 {
34 
// Enables the ""s suffix used to build the std::string constants below.
using namespace std::string_literals;

// Name passed to interface.read() by readMFRStatus().
const auto MFR_STATUS = "mfr_status"s;

// Name handed to the Device base class by the constructor.
const auto DEVICE_NAME = "UCD90160"s;
// Driver name handed to the `interface` member by the constructor.
const auto DRIVER_NAME = "ucd9000"s;
// Number of pages that checkVOUTFaults() walks when reading STATUS_VOUT.
constexpr auto NUM_PAGES = 16;

// Prefix that doGPIOAnalysis() prepends to a GPIO's callout path.
constexpr auto INVENTORY_OBJ_PATH = "/xyz/openbmc_project/inventory";

namespace fs = std::filesystem;
using namespace gpio;
using namespace pmbus;
using namespace phosphor::logging;

// Shorthand for the error namespaces used with commit<>() and report<>().
namespace device_error = sdbusplus::xyz::openbmc_project::Common::Device::Error;
namespace power_error = sdbusplus::org::open_power::Witherspoon::Fault::Error;
52 
53 UCD90160::UCD90160(size_t instance, sdbusplus::bus_t& bus) :
54     Device(DEVICE_NAME, instance),
55     interface(std::get<ucd90160::pathField>(deviceMap.find(instance)->second),
56               DRIVER_NAME, instance),
57     gpioDevice(findGPIODevice(interface.path())), bus(bus)
58 {}
59 
60 void UCD90160::onFailure()
61 {
62     try
63     {
64         auto voutError = checkVOUTFaults();
65 
66         auto pgoodError = checkPGOODFaults(false);
67 
68         // Not a voltage or PGOOD fault, but we know something
69         // failed so still create an error log.
70         if (!voutError && !pgoodError)
71         {
72             createPowerFaultLog();
73         }
74     }
75     catch (const device_error::ReadFailure& e)
76     {
77         if (!accessError)
78         {
79             commit<device_error::ReadFailure>();
80             accessError = true;
81         }
82     }
83 }
84 
85 void UCD90160::analyze()
86 {
87     try
88     {
89         // Note: Voltage faults are always fatal, so they just
90         // need to be analyzed in onFailure().
91 
92         checkPGOODFaults(true);
93     }
94     catch (const device_error::ReadFailure& e)
95     {
96         if (!accessError)
97         {
98             commit<device_error::ReadFailure>();
99             accessError = true;
100         }
101     }
102 }
103 
104 uint16_t UCD90160::readStatusWord()
105 {
106     return interface.read(STATUS_WORD, Type::Debug);
107 }
108 
109 uint32_t UCD90160::readMFRStatus()
110 {
111     return interface.read(MFR_STATUS, Type::HwmonDeviceDebug);
112 }
113 
114 bool UCD90160::checkVOUTFaults()
115 {
116     bool errorCreated = false;
117     auto statusWord = readStatusWord();
118 
119     // The status_word register has a summary bit to tell us
120     // if each page even needs to be checked
121     if (!(statusWord & status_word::VOUT_FAULT))
122     {
123         return errorCreated;
124     }
125 
126     for (size_t page = 0; page < NUM_PAGES; page++)
127     {
128         if (isVoutFaultLogged(page))
129         {
130             continue;
131         }
132 
133         auto statusVout = interface.insertPageNum(STATUS_VOUT, page);
134         uint8_t vout = interface.read(statusVout, Type::Debug);
135 
136         // If any bits are on log them, though some are just
137         // warnings so they won't cause errors
138         if (vout)
139         {
140             log<level::INFO>("A voltage rail has bits on in STATUS_VOUT",
141                              entry("STATUS_VOUT=0x%X", vout),
142                              entry("PAGE=%d", page));
143         }
144 
145         // Log errors if any non-warning bits on
146         if (vout & ~status_vout::WARNING_MASK)
147         {
148             auto& railNames = std::get<ucd90160::railNamesField>(
149                 deviceMap.find(getInstance())->second);
150             auto railName = railNames.at(page);
151 
152             util::NamesValues nv;
153             try
154             {
155                 nv.add("STATUS_WORD", statusWord);
156                 nv.add("STATUS_VOUT", vout);
157                 nv.add("MFR_STATUS", readMFRStatus());
158             }
159             catch (const device_error::ReadFailure& e)
160             {
161                 log<level::ERR>("ReadFailure when collecting metadata");
162                 commit<device_error::ReadFailure>();
163             }
164 
165             using metadata =
166                 org::open_power::Witherspoon::Fault::PowerSequencerVoltageFault;
167 
168             report<power_error::PowerSequencerVoltageFault>(
169                 metadata::RAIL(page), metadata::RAIL_NAME(railName.c_str()),
170                 metadata::RAW_STATUS(nv.get().c_str()));
171 
172             setVoutFaultLogged(page);
173             errorCreated = true;
174         }
175     }
176 
177     return errorCreated;
178 }
179 
180 bool UCD90160::checkPGOODFaults(bool polling)
181 {
182     bool errorCreated = false;
183 
184     // While PGOOD faults could show up in MFR_STATUS (and we could then
185     // check the summary bit in STATUS_WORD first), they are edge triggered,
186     // and as the device driver sends a clear faults command every time we
187     // do a read, we will never see them.  So, we'll have to just read the
188     // real time GPI status GPIO.
189 
190     // Check only the GPIs configured on this system.
191     auto& gpiConfigs = std::get<ucd90160::gpiConfigField>(
192         deviceMap.find(getInstance())->second);
193 
194     for (const auto& gpiConfig : gpiConfigs)
195     {
196         auto gpiNum = std::get<ucd90160::gpiNumField>(gpiConfig);
197         auto doPoll = std::get<ucd90160::pollField>(gpiConfig);
198 
199         // Can skip this one if there is already an error on this input,
200         // or we are polling and these inputs don't need to be polled
201         //(because errors on them are fatal).
202         if (isPGOODFaultLogged(gpiNum) || (polling && !doPoll))
203         {
204             continue;
205         }
206 
207         // The real time status is read via the pin ID
208         auto pinID = std::get<ucd90160::pinIDField>(gpiConfig);
209         auto gpio = gpios.find(pinID);
210         Value gpiStatus;
211 
212         try
213         {
214             // The first time through, create the GPIO objects
215             if (gpio == gpios.end())
216             {
217                 gpios.emplace(pinID, std::make_unique<GPIO>(gpioDevice, pinID,
218                                                             Direction::input));
219                 gpio = gpios.find(pinID);
220             }
221 
222             gpiStatus = gpio->second->read();
223         }
224         catch (const std::exception& e)
225         {
226             if (!accessError)
227             {
228                 log<level::ERR>(e.what());
229                 accessError = true;
230             }
231             continue;
232         }
233 
234         if (gpiStatus == Value::low)
235         {
236             // There may be some extra analysis we can do to narrow the
237             // error down further.  Note that finding an error here won't
238             // prevent us from checking this GPI again.
239             errorCreated = doExtraAnalysis(gpiConfig);
240 
241             if (errorCreated)
242             {
243                 continue;
244             }
245 
246             auto& gpiName = std::get<ucd90160::gpiNameField>(gpiConfig);
247             auto status = (gpiStatus == Value::low) ? 0 : 1;
248 
249             util::NamesValues nv;
250 
251             try
252             {
253                 nv.add("STATUS_WORD", readStatusWord());
254                 nv.add("MFR_STATUS", readMFRStatus());
255                 nv.add("INPUT_STATUS", status);
256             }
257             catch (const device_error::ReadFailure& e)
258             {
259                 log<level::ERR>("ReadFailure when collecting metadata");
260                 commit<device_error::ReadFailure>();
261             }
262 
263             using metadata =
264                 org::open_power::Witherspoon::Fault::PowerSequencerPGOODFault;
265 
266             report<power_error::PowerSequencerPGOODFault>(
267                 metadata::INPUT_NUM(gpiNum),
268                 metadata::INPUT_NAME(gpiName.c_str()),
269                 metadata::RAW_STATUS(nv.get().c_str()));
270 
271             setPGOODFaultLogged(gpiNum);
272             errorCreated = true;
273         }
274     }
275 
276     return errorCreated;
277 }
278 
279 void UCD90160::createPowerFaultLog()
280 {
281     util::NamesValues nv;
282 
283     try
284     {
285         nv.add("STATUS_WORD", readStatusWord());
286         nv.add("MFR_STATUS", readMFRStatus());
287     }
288     catch (const device_error::ReadFailure& e)
289     {
290         log<level::ERR>("ReadFailure when collecting metadata");
291         commit<device_error::ReadFailure>();
292     }
293 
294     using metadata = org::open_power::Witherspoon::Fault::PowerSequencerFault;
295 
296     report<power_error::PowerSequencerFault>(
297         metadata::RAW_STATUS(nv.get().c_str()));
298 }
299 
300 fs::path UCD90160::findGPIODevice(const fs::path& path)
301 {
302     fs::path gpioDevicePath;
303 
304     // In the driver directory, look for a subdirectory
305     // named gpiochipX, where X is some number.  Then
306     // we'll access the GPIO at /dev/gpiochipX.
307     if (fs::is_directory(path))
308     {
309         for (auto& f : fs::directory_iterator(path))
310         {
311             if (f.path().filename().string().find("gpiochip") !=
312                 std::string::npos)
313             {
314                 gpioDevicePath = "/dev" / f.path().filename();
315                 break;
316             }
317         }
318     }
319 
320     if (gpioDevicePath.empty())
321     {
322         log<level::ERR>("Could not find GPIO device path",
323                         entry("BASE_PATH=%s", path.c_str()));
324     }
325 
326     return gpioDevicePath;
327 }
328 
329 bool UCD90160::doExtraAnalysis(const ucd90160::GPIConfig& config)
330 {
331 
332     auto type = std::get<ucd90160::extraAnalysisField>(config);
333     if (type == ucd90160::extraAnalysisType::none)
334     {
335         return false;
336     }
337 
338     // Currently the only extra analysis to do is to check other GPIOs.
339     return doGPIOAnalysis(type);
340 }
341 
342 bool UCD90160::doGPIOAnalysis(ucd90160::extraAnalysisType type)
343 {
344     bool errorFound = false;
345     bool shutdown = false;
346 
347     const auto& analysisConfig = std::get<ucd90160::gpioAnalysisField>(
348         deviceMap.find(getInstance())->second);
349 
350     auto gpioConfig = analysisConfig.find(type);
351     if (gpioConfig == analysisConfig.end())
352     {
353         return errorFound;
354     }
355 
356     auto path = std::get<ucd90160::gpioDevicePathField>(gpioConfig->second);
357 
358     // The /dev/gpiochipX device
359     auto device = findGPIODevice(path);
360 
361     if (device.empty())
362     {
363         log<level::ERR>(
364             "Missing GPIO device - cannot do GPIO analysis of fault",
365             entry("ANALYSIS_TYPE=%d\n", type));
366         return errorFound;
367     }
368 
369     // The GPIO value of the fault condition
370     auto polarity = std::get<ucd90160::gpioPolarityField>(gpioConfig->second);
371 
372     // The GPIOs to check
373     auto& gpios = std::get<ucd90160::gpioDefinitionField>(gpioConfig->second);
374 
375     for (const auto& gpio : gpios)
376     {
377         gpio::Value value;
378 
379         try
380         {
381             GPIO g{device, std::get<ucd90160::gpioNumField>(gpio),
382                    Direction::input};
383 
384             value = g.read();
385         }
386         catch (const std::exception& e)
387         {
388             if (!gpioAccessError)
389             {
390                 // GPIO only throws InternalErrors - not worth committing.
391                 log<level::ERR>(
392                     "GPIO read failed while analyzing a power fault",
393                     entry("CHIP_PATH=%s", path.c_str()));
394 
395                 gpioAccessError = true;
396             }
397             continue;
398         }
399 
400         if (value == polarity)
401         {
402             errorFound = true;
403 
404             std::string part{INVENTORY_OBJ_PATH};
405             part = part + std::get<ucd90160::gpioCalloutField>(gpio);
406             PartCallout callout{type, part};
407 
408             if (isPartCalledOut(callout))
409             {
410                 continue;
411             }
412 
413             // Look up and call the error creation function
414             auto logError =
415                 std::get<ucd90160::errorFunctionField>(gpioConfig->second);
416 
417             logError(*this, part);
418 
419             // Save the part callout so we don't call it out again
420             setPartCallout(callout);
421 
422             // Some errors (like overtemps) require a shutdown
423             auto actions = static_cast<uint32_t>(
424                 std::get<ucd90160::optionFlagsField>(gpioConfig->second));
425 
426             if (actions & static_cast<decltype(actions)>(
427                               ucd90160::optionFlags::shutdownOnFault))
428             {
429                 shutdown = true;
430             }
431         }
432     }
433 
434     if (shutdown)
435     {
436         // Will be replaced with a GPU specific error in a future commit
437         util::powerOff<power_error::Shutdown>(bus);
438     }
439 
440     return errorFound;
441 }
442 
443 void UCD90160::gpuPGOODError(const std::string& callout)
444 {
445     util::NamesValues nv;
446 
447     try
448     {
449         nv.add("STATUS_WORD", readStatusWord());
450         nv.add("MFR_STATUS", readMFRStatus());
451     }
452     catch (const device_error::ReadFailure& e)
453     {
454         log<level::ERR>("ReadFailure when collecting metadata");
455         commit<device_error::ReadFailure>();
456     }
457 
458     using metadata = org::open_power::Witherspoon::Fault::GPUPowerFault;
459 
460     report<power_error::GPUPowerFault>(
461         metadata::RAW_STATUS(nv.get().c_str()),
462         metadata::CALLOUT_INVENTORY_PATH(callout.c_str()));
463 }
464 
465 void UCD90160::gpuOverTempError(const std::string& callout)
466 {
467     util::NamesValues nv;
468 
469     try
470     {
471         nv.add("STATUS_WORD", readStatusWord());
472         nv.add("MFR_STATUS", readMFRStatus());
473     }
474     catch (const device_error::ReadFailure& e)
475     {
476         log<level::ERR>("ReadFailure when collecting metadata");
477         commit<device_error::ReadFailure>();
478     }
479 
480     using metadata = org::open_power::Witherspoon::Fault::GPUOverTemp;
481 
482     report<power_error::GPUOverTemp>(
483         metadata::RAW_STATUS(nv.get().c_str()),
484         metadata::CALLOUT_INVENTORY_PATH(callout.c_str()));
485 }
486 
487 void UCD90160::memGoodError(const std::string& callout)
488 {
489     util::NamesValues nv;
490 
491     try
492     {
493         nv.add("STATUS_WORD", readStatusWord());
494         nv.add("MFR_STATUS", readMFRStatus());
495     }
496     catch (const device_error::ReadFailure& e)
497     {
498         log<level::ERR>("ReadFailure when collecting metadata");
499         commit<device_error::ReadFailure>();
500     }
501 
502     using metadata = org::open_power::Witherspoon::Fault::MemoryPowerFault;
503 
504     report<power_error::MemoryPowerFault>(
505         metadata::RAW_STATUS(nv.get().c_str()),
506         metadata::CALLOUT_INVENTORY_PATH(callout.c_str()));
507 }
508 
509 } // namespace power
510 } // namespace phosphor
511