xref: /openbmc/phosphor-fan-presence/monitor/system.cpp (revision 9ac325c59511fca0e8fc35b0fb575942d42c74df)
1 /**
2  * Copyright © 2021 IBM Corporation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "system.hpp"
17 
18 #include "fan.hpp"
19 #include "fan_defs.hpp"
20 #include "tach_sensor.hpp"
21 #include "trust_manager.hpp"
22 #include "types.hpp"
23 #include "utility.hpp"
24 #ifdef MONITOR_USE_JSON
25 #include "json_config.hpp"
26 #include "json_parser.hpp"
27 #endif
28 
29 #include "config.h"
30 
31 #include "hwmon_ffdc.hpp"
32 
33 #include <nlohmann/json.hpp>
34 #include <phosphor-logging/log.hpp>
35 #include <sdbusplus/bus.hpp>
36 #include <sdeventplus/event.hpp>
37 #include <sdeventplus/source/signal.hpp>
38 
39 namespace phosphor::fan::monitor
40 {
41 
42 using json = nlohmann::json;
43 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
44 
45 using namespace phosphor::logging;
46 
47 System::System(Mode mode, sdbusplus::bus::bus& bus,
48                const sdeventplus::Event& event) :
49     _mode(mode),
50     _bus(bus), _event(event),
51     _powerState(std::make_unique<PGoodState>(
52         bus, std::bind(std::mem_fn(&System::powerStateChanged), this,
53                        std::placeholders::_1))),
54     _thermalAlert(bus, THERMAL_ALERT_OBJPATH)
55 {}
56 
57 void System::start()
58 {
59     namespace match = sdbusplus::bus::match;
60 
61     // must be done before service detection
62     _inventoryMatch = std::make_unique<match::match>(
63         _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC),
64         std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1));
65 
66     bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>(
67         _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus",
68         "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC);
69 
70     if (invServiceRunning)
71     {
72         _inventoryMatch.reset();
73 
74         if (!_loaded)
75         {
76             load();
77         }
78     }
79 }
80 
81 void System::load()
82 {
83     json jsonObj = json::object();
84 #ifdef MONITOR_USE_JSON
85     try
86     {
87         jsonObj = getJsonObj(_bus);
88 #endif
89         auto trustGrps = getTrustGroups(jsonObj);
90         auto fanDefs = getFanDefinitions(jsonObj);
91         // Retrieve and set trust groups within the trust manager
92         setTrustMgr(getTrustGroups(jsonObj));
93         // Clear/set configured fan definitions
94         _fans.clear();
95         _fanHealth.clear();
96         // Retrieve fan definitions and create fan objects to be monitored
97         setFans(fanDefs);
98         setFaultConfig(jsonObj);
99         log<level::INFO>("Configuration loaded");
100 
101         _loaded = true;
102 #ifdef MONITOR_USE_JSON
103     }
104     catch (const phosphor::fan::NoConfigFound&)
105     {}
106 #endif
107 
108     if (_powerState->isPowerOn())
109     {
110         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
111                       [this](auto& rule) {
112                           rule->check(PowerRuleState::runtime, _fanHealth);
113                       });
114     }
115 
116     subscribeSensorsToServices();
117 }
118 
119 void System::subscribeSensorsToServices()
120 {
121     namespace match = sdbusplus::bus::match;
122 
123     _sensorMatch.clear();
124 
125     SensorMapType sensorMap;
126 
127     // build a list of all interfaces, always including the value interface
128     // using set automatically guards against duplicates
129     std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF};
130 
131     for (const auto& fan : _fans)
132     {
133         for (const auto& sensor : fan->sensors())
134         {
135             unique_interfaces.insert(sensor->getInterface());
136         }
137     }
138     // convert them to vector to pass into getSubTreeRaw
139     std::vector<std::string> interfaces(unique_interfaces.begin(),
140                                         unique_interfaces.end());
141 
142     try
143     {
144         // get service information for all service names that are
145         // hosting these interfaces
146         auto serviceObjects = util::SDBusPlus::getSubTreeRaw(
147             _bus, FAN_SENSOR_PATH, interfaces, 0);
148 
149         for (const auto& fan : _fans)
150         {
151             // For every sensor in each fan
152             for (const auto& sensor : fan->sensors())
153             {
154                 const auto itServ = serviceObjects.find(sensor->name());
155 
156                 if (serviceObjects.end() == itServ || itServ->second.empty())
157                 {
158                     getLogger().log(
159                         fmt::format("Fan sensor entry {} not found in D-Bus",
160                                     sensor->name()),
161                         Logger::error);
162                     continue;
163                 }
164 
165                 for (const auto& [serviceName, unused] : itServ->second)
166                 {
167                     // associate service name with sensor
168                     sensorMap[serviceName].insert(sensor);
169                 }
170             }
171         }
172 
173         // only create 1 match per service
174         for (const auto& [serviceName, unused] : sensorMap)
175         {
176             // map its service name to the sensor
177             _sensorMatch.emplace_back(std::make_unique<match::match>(
178                 _bus, match::rules::nameOwnerChanged(serviceName),
179                 std::bind(&System::tachSignalOffline, this,
180                           std::placeholders::_1, sensorMap)));
181         }
182     }
183     catch (const util::DBusError&)
184     {
185         // catch exception from getSubTreeRaw() when fan sensor paths don't
186         // exist yet
187     }
188 }
189 
190 void System::inventoryOnlineCb(sdbusplus::message::message& msg)
191 {
192     namespace match = sdbusplus::bus::match;
193 
194     std::string iface;
195     msg.read(iface);
196 
197     if (util::INVENTORY_INTF != iface)
198     {
199         return;
200     }
201 
202     std::string oldName;
203     msg.read(oldName);
204 
205     std::string newName;
206     msg.read(newName);
207 
208     // newName should never be empty since match was reset on the first
209     // nameOwnerChanged signal received from the service.
210     if (!_loaded && !newName.empty())
211     {
212         load();
213     }
214 
215     // cancel any further notifications about the service state
216     _inventoryMatch.reset();
217 }
218 
219 void System::sighupHandler(sdeventplus::source::Signal&,
220                            const struct signalfd_siginfo*)
221 {
222     try
223     {
224         load();
225     }
226     catch (std::runtime_error& re)
227     {
228         log<level::ERR>("Error reloading config, no config changes made",
229                         entry("LOAD_ERROR=%s", re.what()));
230     }
231 }
232 
233 const std::vector<CreateGroupFunction>
234     System::getTrustGroups(const json& jsonObj)
235 {
236 #ifdef MONITOR_USE_JSON
237     return getTrustGrps(jsonObj);
238 #else
239     return trustGroups;
240 #endif
241 }
242 
243 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs)
244 {
245     _trust = std::make_unique<trust::Manager>(groupFuncs);
246 }
247 
248 const std::vector<FanDefinition> System::getFanDefinitions(const json& jsonObj)
249 {
250 #ifdef MONITOR_USE_JSON
251     return getFanDefs(jsonObj);
252 #else
253     return fanDefinitions;
254 #endif
255 }
256 
257 void System::setFans(const std::vector<FanDefinition>& fanDefs)
258 {
259     for (const auto& fanDef : fanDefs)
260     {
261         // Check if a condition exists on the fan
262         auto condition = std::get<conditionField>(fanDef);
263         if (condition)
264         {
265             // Condition exists, skip adding fan if it fails
266             if (!(*condition)(_bus))
267             {
268                 continue;
269             }
270         }
271         _fans.emplace_back(
272             std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this));
273 
274         updateFanHealth(*(_fans.back()));
275     }
276 }
277 
278 // callback indicating a service went [on|off]line.
279 // Determine on/offline status, set all sensors for that service
280 // to new state
281 //
282 void System::tachSignalOffline(sdbusplus::message::message& msg,
283                                SensorMapType const& sensorMap)
284 {
285     std::string serviceName, oldOwner, newOwner;
286 
287     msg.read(serviceName);
288     msg.read(oldOwner);
289     msg.read(newOwner);
290 
291     // true if sensor server came back online, false -> went offline
292     bool hasOwner = !newOwner.empty() && oldOwner.empty();
293 
294     std::string stateStr(hasOwner ? "online" : "offline");
295     getLogger().log(fmt::format("Changing sensors for service {} to {}",
296                                 serviceName, stateStr),
297                     Logger::info);
298 
299     auto sensorItr(sensorMap.find(serviceName));
300 
301     if (sensorItr != sensorMap.end())
302     {
303         // set all sensors' owner state to not-owned
304         for (auto& sensor : sensorItr->second)
305         {
306             sensor->setOwner(hasOwner);
307             sensor->getFan().process(*sensor);
308         }
309     }
310 }
311 
312 void System::updateFanHealth(const Fan& fan)
313 {
314     std::vector<bool> sensorStatus;
315     for (const auto& sensor : fan.sensors())
316     {
317         sensorStatus.push_back(sensor->functional());
318     }
319 
320     _fanHealth[fan.getName()] =
321         std::make_tuple(fan.present(), std::move(sensorStatus));
322 }
323 
324 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck)
325 {
326     updateFanHealth(fan);
327 
328     if (_powerState->isPowerOn() && !skipRulesCheck)
329     {
330         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
331                       [this](auto& rule) {
332                           rule->check(PowerRuleState::runtime, _fanHealth);
333                       });
334     }
335 }
336 
337 void System::setFaultConfig(const json& jsonObj)
338 {
339 #ifdef MONITOR_USE_JSON
340     std::shared_ptr<PowerInterfaceBase> powerInterface =
341         std::make_shared<PowerInterface>(_thermalAlert);
342 
343     PowerOffAction::PrePowerOffFunc func =
344         std::bind(std::mem_fn(&System::logShutdownError), this);
345 
346     _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func);
347 
348     _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj);
349 #endif
350 }
351 
352 void System::powerStateChanged(bool powerStateOn)
353 {
354     std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) {
355         fan->powerStateChanged(powerStateOn);
356     });
357 
358     if (powerStateOn)
359     {
360         if (!_loaded)
361         {
362             log<level::ERR>("No conf file found at power on");
363             throw std::runtime_error("No conf file found at power on");
364         }
365 
366         // If no fan has its sensors on D-Bus, then there is a problem
367         // with the fan controller.  Log an error and shut down.
368         if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
369                 return fan->numSensorsOnDBusAtPowerOn() == 0;
370             }))
371         {
372             handleOfflineFanController();
373             return;
374         }
375 
376         if (_sensorMatch.empty())
377         {
378             subscribeSensorsToServices();
379         }
380 
381         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
382                       [this](auto& rule) {
383                           rule->check(PowerRuleState::atPgood, _fanHealth);
384                       });
385         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
386                       [this](auto& rule) {
387                           rule->check(PowerRuleState::runtime, _fanHealth);
388                       });
389     }
390     else
391     {
392         _thermalAlert.enabled(false);
393 
394         // Cancel any in-progress power off actions
395         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
396                       [this](auto& rule) { rule->cancel(); });
397     }
398 }
399 
400 void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor)
401 {
402     std::string fanPath{util::INVENTORY_PATH + fan.getName()};
403 
404     getLogger().log(
405         fmt::format("Creating event log for faulted fan {} sensor {}", fanPath,
406                     sensor.name()),
407         Logger::error);
408 
409     // In order to know if the event log should have a severity of error or
410     // informational, count the number of existing nonfunctional sensors and
411     // compare it to _numNonfuncSensorsBeforeError.
412     size_t nonfuncSensors = 0;
413     for (const auto& fan : _fans)
414     {
415         for (const auto& s : fan->sensors())
416         {
417             // Don't count nonfunctional sensors that still have their
418             // error timer running as nonfunctional since they haven't
419             // had event logs created for those errors yet.
420             if (!s->functional() && !s->errorTimerRunning())
421             {
422                 nonfuncSensors++;
423             }
424         }
425     }
426 
427     Severity severity = Severity::Error;
428     if (nonfuncSensors < _numNonfuncSensorsBeforeError)
429     {
430         severity = Severity::Informational;
431     }
432 
433     auto error =
434         std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault",
435                                    fanPath, sensor.name(), severity);
436 
437     auto sensorData = captureSensorData();
438     error->commit(sensorData);
439 
440     // Save the error so it can be committed again on a power off.
441     _lastError = std::move(error);
442 }
443 
444 void System::fanMissingErrorTimerExpired(const Fan& fan)
445 {
446     std::string fanPath{util::INVENTORY_PATH + fan.getName()};
447 
448     getLogger().log(
449         fmt::format("Creating event log for missing fan {}", fanPath),
450         Logger::error);
451 
452     auto error = std::make_unique<FanError>(
453         "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error);
454 
455     auto sensorData = captureSensorData();
456     error->commit(sensorData);
457 
458     // Save the error so it can be committed again on a power off.
459     _lastError = std::move(error);
460 }
461 
462 void System::logShutdownError()
463 {
464     if (_lastError)
465     {
466         getLogger().log("Re-committing previous fan error before power off");
467 
468         // Still use the latest sensor data
469         auto sensorData = captureSensorData();
470         _lastError->commit(sensorData, true);
471     }
472 }
473 
474 json System::captureSensorData()
475 {
476     json data;
477 
478     for (const auto& fan : _fans)
479     {
480         for (const auto& sensor : fan->sensors())
481         {
482             json values;
483             values["present"] = fan->present();
484             values["functional"] = sensor->functional();
485             values["tach"] = sensor->getInput();
486             if (sensor->hasTarget())
487             {
488                 values["target"] = sensor->getTarget();
489             }
490 
491             data["sensors"][sensor->name()] = values;
492         }
493     }
494 
495     return data;
496 }
497 
498 void System::handleOfflineFanController()
499 {
500     getLogger().log("The fan controller appears to be offline.  Shutting down.",
501                     Logger::error);
502 
503     auto ffdc = collectHwmonFFDC();
504 
505     FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline",
506                    Severity::Critical};
507     error.commit(ffdc, true);
508 
509     PowerInterface::executeHardPowerOff();
510 
511     createBmcDump();
512 }
513 
514 /**
515  * @brief Create a BMC Dump
516  */
517 void System::createBmcDump() const
518 {
519     try
520     {
521         util::SDBusPlus::callMethod(
522             "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc",
523             "xyz.openbmc_project.Dump.Create", "CreateDump",
524             std::vector<
525                 std::pair<std::string, std::variant<std::string, uint64_t>>>());
526     }
527     catch (const sdbusplus::exception::exception&)
528     {}
529 }
530 
531 } // namespace phosphor::fan::monitor
532