xref: /openbmc/phosphor-fan-presence/monitor/system.cpp (revision a081956ff4ad2f4d13206b7e80d0587ba187c15b)
1 /**
2  * Copyright © 2022 IBM Corporation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "system.hpp"
17 
18 #include "fan.hpp"
19 #include "fan_defs.hpp"
20 #include "tach_sensor.hpp"
21 #include "trust_manager.hpp"
22 #include "types.hpp"
23 #include "utility.hpp"
24 #ifdef MONITOR_USE_JSON
25 #include "json_config.hpp"
26 #include "json_parser.hpp"
27 #endif
28 
29 #include "config.h"
30 
31 #include "hwmon_ffdc.hpp"
32 
33 #include <nlohmann/json.hpp>
34 #include <phosphor-logging/log.hpp>
35 #include <sdbusplus/bus.hpp>
36 #include <sdbusplus/bus/match.hpp>
37 #include <sdeventplus/event.hpp>
38 #include <sdeventplus/source/signal.hpp>
39 
40 namespace phosphor::fan::monitor
41 {
42 
43 using json = nlohmann::json;
44 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
45 
46 using namespace phosphor::logging;
47 
48 System::System(Mode mode, sdbusplus::bus_t& bus,
49                const sdeventplus::Event& event) :
50     _mode(mode),
51     _bus(bus), _event(event),
52     _powerState(std::make_unique<PGoodState>(
53         bus, std::bind(std::mem_fn(&System::powerStateChanged), this,
54                        std::placeholders::_1))),
55     _thermalAlert(bus, THERMAL_ALERT_OBJPATH)
56 {}
57 
58 void System::start()
59 {
60     namespace match = sdbusplus::bus::match;
61 
62     // must be done before service detection
63     _inventoryMatch = std::make_unique<sdbusplus::bus::match_t>(
64         _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC),
65         std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1));
66 
67     bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>(
68         _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus",
69         "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC);
70 
71     if (invServiceRunning)
72     {
73         _inventoryMatch.reset();
74 
75         if (!_loaded)
76         {
77             load();
78         }
79     }
80 }
81 
82 void System::load()
83 {
84     json jsonObj = json::object();
85 #ifdef MONITOR_USE_JSON
86     try
87     {
88         jsonObj = getJsonObj();
89 #endif
90         auto trustGrps = getTrustGroups(jsonObj);
91         auto fanDefs = getFanDefinitions(jsonObj);
92         // Retrieve and set trust groups within the trust manager
93         setTrustMgr(getTrustGroups(jsonObj));
94         // Clear/set configured fan definitions
95         _fans.clear();
96         _fanHealth.clear();
97         // Retrieve fan definitions and create fan objects to be monitored
98         setFans(fanDefs);
99         setFaultConfig(jsonObj);
100         log<level::INFO>("Configuration loaded");
101 
102         _loaded = true;
103 #ifdef MONITOR_USE_JSON
104     }
105     catch (const phosphor::fan::NoConfigFound&)
106     {}
107 #endif
108 
109     if (_powerState->isPowerOn())
110     {
111         // Fans could be missing on startup, so check the power off rules.
112         // Tach sensors default to functional, so they wouldn't cause a power
113         // off here.
114         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
115                       [this](auto& rule) {
116                           rule->check(PowerRuleState::runtime, _fanHealth);
117                       });
118     }
119 
120     subscribeSensorsToServices();
121 }
122 
123 void System::subscribeSensorsToServices()
124 {
125     namespace match = sdbusplus::bus::match;
126 
127     _sensorMatch.clear();
128 
129     SensorMapType sensorMap;
130 
131     // build a list of all interfaces, always including the value interface
132     // using set automatically guards against duplicates
133     std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF};
134 
135     for (const auto& fan : _fans)
136     {
137         for (const auto& sensor : fan->sensors())
138         {
139             unique_interfaces.insert(sensor->getInterface());
140         }
141     }
142     // convert them to vector to pass into getSubTreeRaw
143     std::vector<std::string> interfaces(unique_interfaces.begin(),
144                                         unique_interfaces.end());
145 
146     try
147     {
148         // get service information for all service names that are
149         // hosting these interfaces
150         auto serviceObjects = util::SDBusPlus::getSubTreeRaw(
151             _bus, FAN_SENSOR_PATH, interfaces, 0);
152 
153         for (const auto& fan : _fans)
154         {
155             // For every sensor in each fan
156             for (const auto& sensor : fan->sensors())
157             {
158                 const auto itServ = serviceObjects.find(sensor->name());
159 
160                 if (serviceObjects.end() == itServ || itServ->second.empty())
161                 {
162                     getLogger().log(
163                         fmt::format("Fan sensor entry {} not found in D-Bus",
164                                     sensor->name()),
165                         Logger::error);
166                     continue;
167                 }
168 
169                 for (const auto& [serviceName, unused] : itServ->second)
170                 {
171                     // associate service name with sensor
172                     sensorMap[serviceName].insert(sensor);
173                 }
174             }
175         }
176 
177         // only create 1 match per service
178         for (const auto& [serviceName, unused] : sensorMap)
179         {
180             // map its service name to the sensor
181             _sensorMatch.emplace_back(std::make_unique<sdbusplus::bus::match_t>(
182                 _bus, match::rules::nameOwnerChanged(serviceName),
183                 std::bind(&System::tachSignalOffline, this,
184                           std::placeholders::_1, sensorMap)));
185         }
186     }
187     catch (const util::DBusError&)
188     {
189         // catch exception from getSubTreeRaw() when fan sensor paths don't
190         // exist yet
191     }
192 }
193 
194 void System::inventoryOnlineCb(sdbusplus::message_t& msg)
195 {
196     namespace match = sdbusplus::bus::match;
197 
198     std::string iface;
199     msg.read(iface);
200 
201     if (util::INVENTORY_INTF != iface)
202     {
203         return;
204     }
205 
206     std::string oldName;
207     msg.read(oldName);
208 
209     std::string newName;
210     msg.read(newName);
211 
212     // newName should never be empty since match was reset on the first
213     // nameOwnerChanged signal received from the service.
214     if (!_loaded && !newName.empty())
215     {
216         load();
217     }
218 
219     // cancel any further notifications about the service state
220     _inventoryMatch.reset();
221 }
222 
223 void System::sighupHandler(sdeventplus::source::Signal&,
224                            const struct signalfd_siginfo*)
225 {
226     try
227     {
228         load();
229     }
230     catch (std::runtime_error& re)
231     {
232         log<level::ERR>("Error reloading config, no config changes made",
233                         entry("LOAD_ERROR=%s", re.what()));
234     }
235 }
236 
237 const std::vector<CreateGroupFunction>
238     System::getTrustGroups([[maybe_unused]] const json& jsonObj)
239 {
240 #ifdef MONITOR_USE_JSON
241     return getTrustGrps(jsonObj);
242 #else
243     return trustGroups;
244 #endif
245 }
246 
247 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs)
248 {
249     _trust = std::make_unique<trust::Manager>(groupFuncs);
250 }
251 
252 const std::vector<FanDefinition>
253     System::getFanDefinitions([[maybe_unused]] const json& jsonObj)
254 {
255 #ifdef MONITOR_USE_JSON
256     return getFanDefs(jsonObj);
257 #else
258     return fanDefinitions;
259 #endif
260 }
261 
262 void System::setFans(const std::vector<FanDefinition>& fanDefs)
263 {
264     for (const auto& fanDef : fanDefs)
265     {
266         // Check if a condition exists on the fan
267         auto condition = std::get<conditionField>(fanDef);
268         if (condition)
269         {
270             // Condition exists, skip adding fan if it fails
271             if (!(*condition)(_bus))
272             {
273                 continue;
274             }
275         }
276         _fans.emplace_back(
277             std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this));
278 
279         updateFanHealth(*(_fans.back()));
280     }
281 }
282 
283 // callback indicating a service went [on|off]line.
284 // Determine on/offline status, set all sensors for that service
285 // to new state
286 //
287 void System::tachSignalOffline(sdbusplus::message_t& msg,
288                                SensorMapType const& sensorMap)
289 {
290     std::string serviceName, oldOwner, newOwner;
291 
292     msg.read(serviceName);
293     msg.read(oldOwner);
294     msg.read(newOwner);
295 
296     // true if sensor server came back online, false -> went offline
297     bool hasOwner = !newOwner.empty() && oldOwner.empty();
298 
299     std::string stateStr(hasOwner ? "online" : "offline");
300     getLogger().log(fmt::format("Changing sensors for service {} to {}",
301                                 serviceName, stateStr),
302                     Logger::info);
303 
304     auto sensorItr(sensorMap.find(serviceName));
305 
306     if (sensorItr != sensorMap.end())
307     {
308         // set all sensors' owner state to not-owned
309         for (auto& sensor : sensorItr->second)
310         {
311             sensor->setOwner(hasOwner);
312             sensor->getFan().process(*sensor);
313         }
314     }
315 }
316 
317 void System::updateFanHealth(const Fan& fan)
318 {
319     std::vector<bool> sensorStatus;
320     for (const auto& sensor : fan.sensors())
321     {
322         sensorStatus.push_back(sensor->functional());
323     }
324 
325     _fanHealth[fan.getName()] =
326         std::make_tuple(fan.present(), std::move(sensorStatus));
327 }
328 
329 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck)
330 {
331     updateFanHealth(fan);
332 
333     if (_powerState->isPowerOn() && !skipRulesCheck)
334     {
335         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
336                       [this](auto& rule) {
337                           rule->check(PowerRuleState::runtime, _fanHealth);
338                       });
339     }
340 }
341 
342 void System::setFaultConfig([[maybe_unused]] const json& jsonObj)
343 {
344 #ifdef MONITOR_USE_JSON
345     std::shared_ptr<PowerInterfaceBase> powerInterface =
346         std::make_shared<PowerInterface>(_thermalAlert);
347 
348     PowerOffAction::PrePowerOffFunc func =
349         std::bind(std::mem_fn(&System::logShutdownError), this);
350 
351     _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func);
352 
353     _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj);
354 #endif
355 }
356 
357 void System::powerStateChanged(bool powerStateOn)
358 {
359     std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) {
360         fan->powerStateChanged(powerStateOn);
361     });
362 
363     if (powerStateOn)
364     {
365         if (!_loaded)
366         {
367             log<level::ERR>("No conf file found at power on");
368             throw std::runtime_error("No conf file found at power on");
369         }
370 
371         // If no fan has its sensors on D-Bus, then there is a problem
372         // with the fan controller.  Log an error and shut down.
373         if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
374                 return fan->numSensorsOnDBusAtPowerOn() == 0;
375             }))
376         {
377             handleOfflineFanController();
378             return;
379         }
380 
381         if (_sensorMatch.empty())
382         {
383             subscribeSensorsToServices();
384         }
385 
386         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
387                       [this](auto& rule) {
388                           rule->check(PowerRuleState::atPgood, _fanHealth);
389                       });
390         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
391                       [this](auto& rule) {
392                           rule->check(PowerRuleState::runtime, _fanHealth);
393                       });
394     }
395     else
396     {
397         _thermalAlert.enabled(false);
398 
399         // Cancel any in-progress power off actions
400         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
401                       [this](auto& rule) { rule->cancel(); });
402     }
403 }
404 
405 void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor)
406 {
407     std::string fanPath{util::INVENTORY_PATH + fan.getName()};
408 
409     getLogger().log(
410         fmt::format("Creating event log for faulted fan {} sensor {}", fanPath,
411                     sensor.name()),
412         Logger::error);
413 
414     // In order to know if the event log should have a severity of error or
415     // informational, count the number of existing nonfunctional sensors and
416     // compare it to _numNonfuncSensorsBeforeError.
417     size_t nonfuncSensors = 0;
418     for (const auto& fan : _fans)
419     {
420         for (const auto& s : fan->sensors())
421         {
422             // Don't count nonfunctional sensors that still have their
423             // error timer running as nonfunctional since they haven't
424             // had event logs created for those errors yet.
425             if (!s->functional() && !s->errorTimerRunning())
426             {
427                 nonfuncSensors++;
428             }
429         }
430     }
431 
432     Severity severity = Severity::Error;
433     if (nonfuncSensors < _numNonfuncSensorsBeforeError)
434     {
435         severity = Severity::Informational;
436     }
437 
438     auto error =
439         std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault",
440                                    fanPath, sensor.name(), severity);
441 
442     auto sensorData = captureSensorData();
443     error->commit(sensorData);
444 
445     // Save the error so it can be committed again on a power off.
446     _lastError = std::move(error);
447 }
448 
449 void System::fanMissingErrorTimerExpired(const Fan& fan)
450 {
451     std::string fanPath{util::INVENTORY_PATH + fan.getName()};
452 
453     getLogger().log(
454         fmt::format("Creating event log for missing fan {}", fanPath),
455         Logger::error);
456 
457     auto error = std::make_unique<FanError>(
458         "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error);
459 
460     auto sensorData = captureSensorData();
461     error->commit(sensorData);
462 
463     // Save the error so it can be committed again on a power off.
464     _lastError = std::move(error);
465 }
466 
467 void System::logShutdownError()
468 {
469     if (_lastError)
470     {
471         getLogger().log("Re-committing previous fan error before power off");
472 
473         // Still use the latest sensor data
474         auto sensorData = captureSensorData();
475         _lastError->commit(sensorData, true);
476     }
477 }
478 
479 json System::captureSensorData()
480 {
481     json data;
482 
483     for (const auto& fan : _fans)
484     {
485         for (const auto& sensor : fan->sensors())
486         {
487             json values;
488             values["present"] = fan->present();
489             values["functional"] = sensor->functional();
490             values["tach"] = sensor->getInput();
491 
492             if (sensor->hasTarget())
493             {
494                 values["target"] = sensor->getTarget();
495             }
496 
497             // convert between string/json to remove newlines
498             values["prev_tachs"] = json(sensor->getPrevTach()).dump();
499 
500             if (sensor->hasTarget())
501             {
502                 values["prev_targets"] = json(sensor->getPrevTarget()).dump();
503             }
504 
505             if (sensor->getMethod() == MethodMode::count)
506             {
507                 values["ticks"] = sensor->getCounter();
508             }
509             data["sensors"][sensor->name()] = values;
510         }
511     }
512 
513     return data;
514 }
515 
516 void System::handleOfflineFanController()
517 {
518     getLogger().log("The fan controller appears to be offline.  Shutting down.",
519                     Logger::error);
520 
521     auto ffdc = collectHwmonFFDC();
522 
523     FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline",
524                    Severity::Critical};
525     error.commit(ffdc, true);
526 
527     PowerInterface::executeHardPowerOff();
528 
529     createBmcDump();
530 }
531 
532 /**
533  * @brief Create a BMC Dump
534  */
535 void System::createBmcDump() const
536 {
537     try
538     {
539         util::SDBusPlus::callMethod(
540             "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc",
541             "xyz.openbmc_project.Dump.Create", "CreateDump",
542             std::vector<
543                 std::pair<std::string, std::variant<std::string, uint64_t>>>());
544     }
545     catch (const std::exception& e)
546     {
547         getLogger().log(
548             fmt::format("Caught exception while creating BMC dump: {}",
549                         e.what()),
550             Logger::error);
551     }
552 }
553 
554 } // namespace phosphor::fan::monitor
555