xref: /openbmc/phosphor-fan-presence/monitor/system.cpp (revision 9d533806250cea56406bdd39e025f0d820c4ed90)
1 /**
2  * Copyright © 2022 IBM Corporation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "system.hpp"
17 
18 #include "dbus_paths.hpp"
19 #include "fan.hpp"
20 #include "fan_defs.hpp"
21 #include "tach_sensor.hpp"
22 #include "trust_manager.hpp"
23 #include "types.hpp"
24 #include "utility.hpp"
25 #ifdef MONITOR_USE_JSON
26 #include "json_config.hpp"
27 #include "json_parser.hpp"
28 #endif
29 
30 #include "config.h"
31 
32 #include "hwmon_ffdc.hpp"
33 
34 #include <nlohmann/json.hpp>
35 #include <phosphor-logging/lg2.hpp>
36 #include <sdbusplus/bus.hpp>
37 #include <sdbusplus/bus/match.hpp>
38 #include <sdeventplus/event.hpp>
39 #include <sdeventplus/source/signal.hpp>
40 
41 namespace phosphor::fan::monitor
42 {
43 
44 using json = nlohmann::json;
45 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
46 
47 const std::string System::dumpFile = "/tmp/fan_monitor_dump.json";
48 
System(Mode mode,sdbusplus::bus_t & bus,const sdeventplus::Event & event)49 System::System(Mode mode, sdbusplus::bus_t& bus,
50                const sdeventplus::Event& event) :
51     _mode(mode), _bus(bus), _event(event),
52 #ifdef MONITOR_USE_HOST_STATE
53     _powerState(std::make_unique<HostPowerState>(
54 #else
55     _powerState(std::make_unique<PGoodState>(
56 #endif
57         bus, std::bind(std::mem_fn(&System::powerStateChanged), this,
58                        std::placeholders::_1))),
59     _thermalAlert(bus, THERMAL_ALERT_OBJPATH)
60 {}
61 
start()62 void System::start()
63 {
64     namespace match = sdbusplus::bus::match;
65 
66     // must be done before service detection
67     _inventoryMatch = std::make_unique<sdbusplus::bus::match_t>(
68         _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC),
69         std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1));
70 
71     bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>(
72         _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus",
73         "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC);
74 
75     if (invServiceRunning)
76     {
77         _inventoryMatch.reset();
78 
79         if (!_loaded)
80         {
81             load();
82         }
83     }
84 }
85 
load()86 void System::load()
87 {
88     json jsonObj = json::object();
89 #ifdef MONITOR_USE_JSON
90     try
91     {
92         jsonObj = getJsonObj();
93 #endif
94         auto trustGrps = getTrustGroups(jsonObj);
95         auto fanDefs = getFanDefinitions(jsonObj);
96         // Retrieve and set trust groups within the trust manager
97         setTrustMgr(getTrustGroups(jsonObj));
98         // Clear/set configured fan definitions
99         _fans.clear();
100         _fanHealth.clear();
101         // Retrieve fan definitions and create fan objects to be monitored
102         setFans(fanDefs);
103         setFaultConfig(jsonObj);
104         lg2::info("Configuration loaded");
105 
106         _loaded = true;
107 #ifdef MONITOR_USE_JSON
108     }
109     catch (const phosphor::fan::NoConfigFound&)
110     {}
111 #endif
112 
113     if (_powerState->isPowerOn())
114     {
115         // Fans could be missing on startup, so check the power off rules.
116         // Tach sensors default to functional, so they wouldn't cause a power
117         // off here.
118         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
119                       [this](auto& rule) {
120                           rule->check(PowerRuleState::runtime, _fanHealth);
121                       });
122     }
123 
124     subscribeSensorsToServices();
125 }
126 
/**
 * @brief Creates one NameOwnerChanged match per service hosting fan sensors.
 *
 * Existing matches are discarded.  The set of sensor interfaces (always
 * including the sensor value interface) is looked up under FAN_SENSOR_PATH,
 * each sensor is associated with the service(s) hosting it, and a match
 * calling tachSignalOffline() is registered for every such service.
 * Sensors not found in the lookup are logged and skipped.  A DBusError from
 * the lookup is ignored.
 */
void System::subscribeSensorsToServices()
{
    namespace rules = sdbusplus::bus::match::rules;

    _sensorMatch.clear();

    // A set is used so each interface is only listed once; the value
    // interface is always part of the list.
    std::set<std::string> intfSet{util::FAN_SENSOR_VALUE_INTF};
    for (const auto& fan : _fans)
    {
        for (const auto& sensor : fan->sensors())
        {
            intfSet.insert(sensor->getInterface());
        }
    }

    // getSubTreeRaw is handed a vector
    std::vector<std::string> intfList(intfSet.begin(), intfSet.end());

    try
    {
        // Service information for everything hosting these interfaces
        auto subtree =
            util::SDBusPlus::getSubTreeRaw(_bus, FAN_SENSOR_PATH, intfList, 0);

        // service name -> sensors hosted by that service
        SensorMapType sensorsByService;

        for (const auto& fan : _fans)
        {
            for (const auto& sensor : fan->sensors())
            {
                const auto found = subtree.find(sensor->name());
                const bool missing =
                    (subtree.end() == found) || found->second.empty();

                if (missing)
                {
                    getLogger().log(
                        std::format("Fan sensor entry {} not found in D-Bus",
                                    sensor->name()),
                        Logger::error);
                    continue;
                }

                for (const auto& [service, unusedIntfs] : found->second)
                {
                    sensorsByService[service].insert(sensor);
                }
            }
        }

        // Exactly one match per service; each callback keeps its own copy
        // of the service -> sensors map.
        for (const auto& [service, unusedSensors] : sensorsByService)
        {
            _sensorMatch.emplace_back(std::make_unique<sdbusplus::bus::match_t>(
                _bus, rules::nameOwnerChanged(service),
                [this, sensorsByService](sdbusplus::message_t& msg) {
                    tachSignalOffline(msg, sensorsByService);
                }));
        }
    }
    catch (const util::DBusError&)
    {
        // getSubTreeRaw() throws when the fan sensor paths don't exist yet
    }
}
197 
inventoryOnlineCb(sdbusplus::message_t & msg)198 void System::inventoryOnlineCb(sdbusplus::message_t& msg)
199 {
200     namespace match = sdbusplus::bus::match;
201 
202     std::string iface;
203     msg.read(iface);
204 
205     if (util::INVENTORY_INTF != iface)
206     {
207         return;
208     }
209 
210     std::string oldName;
211     msg.read(oldName);
212 
213     std::string newName;
214     msg.read(newName);
215 
216     // newName should never be empty since match was reset on the first
217     // nameOwnerChanged signal received from the service.
218     if (!_loaded && !newName.empty())
219     {
220         load();
221     }
222 
223     // cancel any further notifications about the service state
224     _inventoryMatch.reset();
225 }
226 
sighupHandler(sdeventplus::source::Signal &,const struct signalfd_siginfo *)227 void System::sighupHandler(sdeventplus::source::Signal&,
228                            const struct signalfd_siginfo*)
229 {
230     try
231     {
232         load();
233     }
234     catch (std::runtime_error& re)
235     {
236         lg2::error(
237             "Error reloading config, no config changes made: {LOAD_ERROR}",
238             "LOAD_ERROR", re);
239     }
240 }
241 
getTrustGroups(const json & jsonObj)242 const std::vector<CreateGroupFunction> System::getTrustGroups(
243     [[maybe_unused]] const json& jsonObj)
244 {
245 #ifdef MONITOR_USE_JSON
246     return getTrustGrps(jsonObj);
247 #else
248     return trustGroups;
249 #endif
250 }
251 
setTrustMgr(const std::vector<CreateGroupFunction> & groupFuncs)252 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs)
253 {
254     _trust = std::make_unique<trust::Manager>(groupFuncs);
255 }
256 
getFanDefinitions(const json & jsonObj)257 const std::vector<FanDefinition> System::getFanDefinitions(
258     [[maybe_unused]] const json& jsonObj)
259 {
260 #ifdef MONITOR_USE_JSON
261     return getFanDefs(jsonObj);
262 #else
263     return fanDefinitions;
264 #endif
265 }
266 
setFans(const std::vector<FanDefinition> & fanDefs)267 void System::setFans(const std::vector<FanDefinition>& fanDefs)
268 {
269     for (const auto& fanDef : fanDefs)
270     {
271         // Check if a condition exists on the fan
272         auto condition = fanDef.condition;
273         if (condition)
274         {
275             // Condition exists, skip adding fan if it fails
276             if (!(*condition)(_bus))
277             {
278                 continue;
279             }
280         }
281         _fans.emplace_back(
282             std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this));
283 
284         updateFanHealth(*(_fans.back()));
285     }
286 }
287 
/**
 * @brief NameOwnerChanged callback for a service hosting tach sensors.
 *
 * Works out whether the service came online, logs the change, then sets
 * the owner state on every sensor mapped to that service and has the
 * sensor's fan process it.
 *
 * @param[in] msg - The NameOwnerChanged signal message
 * @param[in] sensorMap - Map of service name to the sensors it hosts
 */
void System::tachSignalOffline(sdbusplus::message_t& msg,
                               const SensorMapType& sensorMap)
{
    std::string serviceName;
    std::string oldOwner;
    std::string newOwner;
    msg.read(serviceName, oldOwner, newOwner);

    // true when the service went from having no owner to having one;
    // anything else is treated as offline.
    // NOTE(review): a change where both owners are non-empty is therefore
    // reported as offline - confirm that is intended.
    const bool hasOwner = oldOwner.empty() && !newOwner.empty();

    getLogger().log(std::format("Changing sensors for service {} to {}",
                                serviceName, hasOwner ? "online" : "offline"),
                    Logger::info);

    const auto found = sensorMap.find(serviceName);
    if (found == sensorMap.end())
    {
        return;
    }

    // Apply the new owner state to every sensor hosted by this service
    for (auto& sensor : found->second)
    {
        sensor->setOwner(hasOwner);
        sensor->getFan().process(*sensor);
    }
}
321 
updateFanHealth(const Fan & fan)322 void System::updateFanHealth(const Fan& fan)
323 {
324     std::vector<bool> sensorStatus;
325     for (const auto& sensor : fan.sensors())
326     {
327         sensorStatus.push_back(sensor->functional());
328     }
329 
330     _fanHealth[fan.getName()] =
331         std::make_tuple(fan.present(), std::move(sensorStatus));
332 }
333 
fanStatusChange(const Fan & fan,bool skipRulesCheck)334 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck)
335 {
336     updateFanHealth(fan);
337 
338     if (_powerState->isPowerOn() && !skipRulesCheck)
339     {
340         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
341                       [this](auto& rule) {
342                           rule->check(PowerRuleState::runtime, _fanHealth);
343                       });
344     }
345 }
346 
setFaultConfig(const json & jsonObj)347 void System::setFaultConfig([[maybe_unused]] const json& jsonObj)
348 {
349 #ifdef MONITOR_USE_JSON
350     std::shared_ptr<PowerInterfaceBase> powerInterface =
351         std::make_shared<PowerInterface>(_thermalAlert);
352 
353     PowerOffAction::PrePowerOffFunc func =
354         std::bind(std::mem_fn(&System::logShutdownError), this);
355 
356     _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func);
357 
358     _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj);
359 #endif
360 }
361 
powerStateChanged(bool powerStateOn)362 void System::powerStateChanged(bool powerStateOn)
363 {
364     std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) {
365         fan->powerStateChanged(powerStateOn);
366     });
367 
368     if (powerStateOn)
369     {
370         if (!_loaded)
371         {
372             lg2::error("No conf file found at power on");
373             throw std::runtime_error("No conf file found at power on");
374         }
375 
376         // If no fan has its sensors on D-Bus, then there is a problem
377         // with the fan controller.  Log an error and shut down.
378         if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
379                 return fan->numSensorsOnDBusAtPowerOn() == 0;
380             }))
381         {
382 #if DELAY_HOST_CONTROL > 0
383             sleep(DELAY_HOST_CONTROL);
384             std::for_each(_fans.begin(), _fans.end(),
385                           [powerStateOn](auto& fan) {
386                               fan->powerStateChanged(powerStateOn);
387                           });
388             if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
389                     return fan->numSensorsOnDBusAtPowerOn() == 0;
390                 }))
391             {
392                 handleOfflineFanController();
393                 return;
394             }
395 #else
396             handleOfflineFanController();
397             return;
398 #endif
399         }
400 
401         if (_sensorMatch.empty())
402         {
403             subscribeSensorsToServices();
404         }
405 
406         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
407                       [this](auto& rule) {
408                           rule->check(PowerRuleState::atPgood, _fanHealth);
409                       });
410         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
411                       [this](auto& rule) {
412                           rule->check(PowerRuleState::runtime, _fanHealth);
413                       });
414     }
415     else
416     {
417         _thermalAlert.enabled(false);
418 
419         // Cancel any in-progress power off actions
420         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
421                       [this](auto& rule) { rule->cancel(); });
422     }
423 }
424 
/**
 * @brief Creates an event log for a faulted fan sensor.
 *
 * The log's severity is Informational when the number of nonfunctional
 * sensors (not counting those whose error timer is still running) is below
 * _numNonfuncSensorsBeforeError, and Error otherwise.  The error is
 * committed with the current sensor data and saved so it can be committed
 * again on a power off.
 *
 * @param[in] fan - The fan the faulted sensor belongs to
 * @param[in] sensor - The faulted sensor
 */
void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor)
{
    std::string fanPath{util::INVENTORY_PATH + fan.getName()};

    getLogger().log(
        std::format("Creating event log for faulted fan {} sensor {}", fanPath,
                    sensor.name()),
        Logger::error);

    // In order to know if the event log should have a severity of error or
    // informational, count the number of existing nonfunctional sensors and
    // compare it to _numNonfuncSensorsBeforeError.
    size_t nonfuncSensors = 0;
    // Named so as not to shadow the 'fan' parameter
    for (const auto& monitoredFan : _fans)
    {
        for (const auto& s : monitoredFan->sensors())
        {
            // Don't count nonfunctional sensors that still have their
            // error timer running as nonfunctional since they haven't
            // had event logs created for those errors yet.
            if (!s->functional() && !s->errorTimerRunning())
            {
                ++nonfuncSensors;
            }
        }
    }

    Severity severity = Severity::Error;
    if (nonfuncSensors < _numNonfuncSensorsBeforeError)
    {
        severity = Severity::Informational;
    }

    auto error =
        std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault",
                                   fanPath, sensor.name(), severity);

    auto sensorData = captureSensorData();
    error->commit(sensorData);

    // Save the error so it can be committed again on a power off.
    _lastError = std::move(error);
}
468 
/**
 * @brief Creates an Error severity event log for a missing fan.
 *
 * The error is committed with the current sensor data and saved so it can
 * be committed again on a power off.
 *
 * @param[in] fan - The missing fan
 */
void System::fanMissingErrorTimerExpired(const Fan& fan)
{
    std::string inventoryPath{util::INVENTORY_PATH + fan.getName()};

    getLogger().log(
        std::format("Creating event log for missing fan {}", inventoryPath),
        Logger::error);

    auto missingError = std::make_unique<FanError>(
        "xyz.openbmc_project.Fan.Error.Missing", inventoryPath, "",
        Severity::Error);

    auto snapshot = captureSensorData();
    missingError->commit(snapshot);

    // Keep the error around for re-commit at power off
    _lastError = std::move(missingError);
}
486 
logShutdownError()487 void System::logShutdownError()
488 {
489     if (_lastError)
490     {
491         getLogger().log("Re-committing previous fan error before power off");
492 
493         // Still use the latest sensor data
494         auto sensorData = captureSensorData();
495         _lastError->commit(sensorData, true);
496     }
497 }
498 
captureSensorData()499 json System::captureSensorData()
500 {
501     json data;
502 
503     for (const auto& fan : _fans)
504     {
505         for (const auto& sensor : fan->sensors())
506         {
507             json values;
508             values["present"] = fan->present();
509             values["functional"] = sensor->functional();
510             values["in_range"] = !fan->outOfRange(*sensor);
511             values["tach"] = sensor->getInput();
512 
513             if (sensor->hasTarget())
514             {
515                 values["target"] = sensor->getTarget();
516             }
517 
518             // convert between string/json to remove newlines
519             values["prev_tachs"] = json(sensor->getPrevTach()).dump();
520 
521             if (sensor->hasTarget())
522             {
523                 values["prev_targets"] = json(sensor->getPrevTarget()).dump();
524             }
525 
526             if (sensor->getMethod() == MethodMode::count)
527             {
528                 values["ticks"] = sensor->getCounter();
529             }
530             data["sensors"][sensor->name()] = values;
531         }
532     }
533 
534     return data;
535 }
536 
handleOfflineFanController()537 void System::handleOfflineFanController()
538 {
539     getLogger().log("The fan controller appears to be offline.  Shutting down.",
540                     Logger::error);
541 
542     auto ffdc = collectHwmonFFDC();
543 
544     FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline",
545                    Severity::Critical};
546     error.commit(ffdc, true);
547 
548     PowerInterface::executeHardPowerOff();
549 
550     createBmcDump();
551 }
552 
/**
 * @brief Create a BMC Dump
 *
 * Calls CreateDump on the dump manager with an empty parameter list.  Any
 * std::exception from the call is logged and not propagated.
 */
void System::createBmcDump() const
{
    // Type of the (empty) parameter list handed to CreateDump
    using DumpParams =
        std::vector<std::pair<std::string, std::variant<std::string, uint64_t>>>;

    try
    {
        util::SDBusPlus::callMethod(
            "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc",
            "xyz.openbmc_project.Dump.Create", "CreateDump", DumpParams{});
    }
    catch (const std::exception& e)
    {
        getLogger().log(
            std::format("Caught exception while creating BMC dump: {}",
                        e.what()),
            Logger::error);
    }
}
574 
dumpDebugData(sdeventplus::source::Signal &,const struct signalfd_siginfo *)575 void System::dumpDebugData(sdeventplus::source::Signal&,
576                            const struct signalfd_siginfo*)
577 {
578     json output;
579 
580     if (_loaded)
581     {
582         output["logs"] = getLogger().getLogs();
583         output["sensors"] = captureSensorData();
584     }
585     else
586     {
587         output["error"] = "Fan monitor not loaded yet.  Try again later.";
588     }
589 
590     std::ofstream file{System::dumpFile};
591     if (!file)
592     {
593         lg2::error("Could not open file for fan monitor dump");
594     }
595     else
596     {
597         file << std::setw(4) << output;
598     }
599 }
600 
601 } // namespace phosphor::fan::monitor
602