xref: /openbmc/phosphor-fan-presence/monitor/system.cpp (revision 78a48a55bbb4223fa2004be04469599d57998192)
1 /**
2  * Copyright © 2022 IBM Corporation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "system.hpp"
17 
18 #include "fan.hpp"
19 #include "fan_defs.hpp"
20 #include "tach_sensor.hpp"
21 #include "trust_manager.hpp"
22 #include "types.hpp"
23 #include "utility.hpp"
24 #ifdef MONITOR_USE_JSON
25 #include "json_config.hpp"
26 #include "json_parser.hpp"
27 #endif
28 
29 #include "config.h"
30 
31 #include "hwmon_ffdc.hpp"
32 
33 #include <nlohmann/json.hpp>
34 #include <phosphor-logging/log.hpp>
35 #include <sdbusplus/bus.hpp>
36 #include <sdeventplus/event.hpp>
37 #include <sdeventplus/source/signal.hpp>
38 
39 namespace phosphor::fan::monitor
40 {
41 
42 using json = nlohmann::json;
43 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
44 
45 using namespace phosphor::logging;
46 
47 System::System(Mode mode, sdbusplus::bus::bus& bus,
48                const sdeventplus::Event& event) :
49     _mode(mode),
50     _bus(bus), _event(event),
51     _powerState(std::make_unique<PGoodState>(
52         bus, std::bind(std::mem_fn(&System::powerStateChanged), this,
53                        std::placeholders::_1))),
54     _thermalAlert(bus, THERMAL_ALERT_OBJPATH)
55 {}
56 
57 void System::start()
58 {
59     namespace match = sdbusplus::bus::match;
60 
61     // must be done before service detection
62     _inventoryMatch = std::make_unique<match::match>(
63         _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC),
64         std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1));
65 
66     bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>(
67         _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus",
68         "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC);
69 
70     if (invServiceRunning)
71     {
72         _inventoryMatch.reset();
73 
74         if (!_loaded)
75         {
76             load();
77         }
78     }
79 }
80 
81 void System::load()
82 {
83     json jsonObj = json::object();
84 #ifdef MONITOR_USE_JSON
85     try
86     {
87         jsonObj = getJsonObj();
88 #endif
89         auto trustGrps = getTrustGroups(jsonObj);
90         auto fanDefs = getFanDefinitions(jsonObj);
91         // Retrieve and set trust groups within the trust manager
92         setTrustMgr(getTrustGroups(jsonObj));
93         // Clear/set configured fan definitions
94         _fans.clear();
95         _fanHealth.clear();
96         // Retrieve fan definitions and create fan objects to be monitored
97         setFans(fanDefs);
98         setFaultConfig(jsonObj);
99         log<level::INFO>("Configuration loaded");
100 
101         _loaded = true;
102 #ifdef MONITOR_USE_JSON
103     }
104     catch (const phosphor::fan::NoConfigFound&)
105     {}
106 #endif
107 
108     if (_powerState->isPowerOn())
109     {
110         // Fans could be missing on startup, so check the power off rules.
111         // Tach sensors default to functional, so they wouldn't cause a power
112         // off here.
113         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
114                       [this](auto& rule) {
115                           rule->check(PowerRuleState::runtime, _fanHealth);
116                       });
117     }
118 
119     subscribeSensorsToServices();
120 }
121 
122 void System::subscribeSensorsToServices()
123 {
124     namespace match = sdbusplus::bus::match;
125 
126     _sensorMatch.clear();
127 
128     SensorMapType sensorMap;
129 
130     // build a list of all interfaces, always including the value interface
131     // using set automatically guards against duplicates
132     std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF};
133 
134     for (const auto& fan : _fans)
135     {
136         for (const auto& sensor : fan->sensors())
137         {
138             unique_interfaces.insert(sensor->getInterface());
139         }
140     }
141     // convert them to vector to pass into getSubTreeRaw
142     std::vector<std::string> interfaces(unique_interfaces.begin(),
143                                         unique_interfaces.end());
144 
145     try
146     {
147         // get service information for all service names that are
148         // hosting these interfaces
149         auto serviceObjects = util::SDBusPlus::getSubTreeRaw(
150             _bus, FAN_SENSOR_PATH, interfaces, 0);
151 
152         for (const auto& fan : _fans)
153         {
154             // For every sensor in each fan
155             for (const auto& sensor : fan->sensors())
156             {
157                 const auto itServ = serviceObjects.find(sensor->name());
158 
159                 if (serviceObjects.end() == itServ || itServ->second.empty())
160                 {
161                     getLogger().log(
162                         fmt::format("Fan sensor entry {} not found in D-Bus",
163                                     sensor->name()),
164                         Logger::error);
165                     continue;
166                 }
167 
168                 for (const auto& [serviceName, unused] : itServ->second)
169                 {
170                     // associate service name with sensor
171                     sensorMap[serviceName].insert(sensor);
172                 }
173             }
174         }
175 
176         // only create 1 match per service
177         for (const auto& [serviceName, unused] : sensorMap)
178         {
179             // map its service name to the sensor
180             _sensorMatch.emplace_back(std::make_unique<match::match>(
181                 _bus, match::rules::nameOwnerChanged(serviceName),
182                 std::bind(&System::tachSignalOffline, this,
183                           std::placeholders::_1, sensorMap)));
184         }
185     }
186     catch (const util::DBusError&)
187     {
188         // catch exception from getSubTreeRaw() when fan sensor paths don't
189         // exist yet
190     }
191 }
192 
193 void System::inventoryOnlineCb(sdbusplus::message::message& msg)
194 {
195     namespace match = sdbusplus::bus::match;
196 
197     std::string iface;
198     msg.read(iface);
199 
200     if (util::INVENTORY_INTF != iface)
201     {
202         return;
203     }
204 
205     std::string oldName;
206     msg.read(oldName);
207 
208     std::string newName;
209     msg.read(newName);
210 
211     // newName should never be empty since match was reset on the first
212     // nameOwnerChanged signal received from the service.
213     if (!_loaded && !newName.empty())
214     {
215         load();
216     }
217 
218     // cancel any further notifications about the service state
219     _inventoryMatch.reset();
220 }
221 
222 void System::sighupHandler(sdeventplus::source::Signal&,
223                            const struct signalfd_siginfo*)
224 {
225     try
226     {
227         load();
228     }
229     catch (std::runtime_error& re)
230     {
231         log<level::ERR>("Error reloading config, no config changes made",
232                         entry("LOAD_ERROR=%s", re.what()));
233     }
234 }
235 
236 const std::vector<CreateGroupFunction>
237     System::getTrustGroups([[maybe_unused]] const json& jsonObj)
238 {
239 #ifdef MONITOR_USE_JSON
240     return getTrustGrps(jsonObj);
241 #else
242     return trustGroups;
243 #endif
244 }
245 
246 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs)
247 {
248     _trust = std::make_unique<trust::Manager>(groupFuncs);
249 }
250 
251 const std::vector<FanDefinition>
252     System::getFanDefinitions([[maybe_unused]] const json& jsonObj)
253 {
254 #ifdef MONITOR_USE_JSON
255     return getFanDefs(jsonObj);
256 #else
257     return fanDefinitions;
258 #endif
259 }
260 
261 void System::setFans(const std::vector<FanDefinition>& fanDefs)
262 {
263     for (const auto& fanDef : fanDefs)
264     {
265         // Check if a condition exists on the fan
266         auto condition = std::get<conditionField>(fanDef);
267         if (condition)
268         {
269             // Condition exists, skip adding fan if it fails
270             if (!(*condition)(_bus))
271             {
272                 continue;
273             }
274         }
275         _fans.emplace_back(
276             std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this));
277 
278         updateFanHealth(*(_fans.back()));
279     }
280 }
281 
282 // callback indicating a service went [on|off]line.
283 // Determine on/offline status, set all sensors for that service
284 // to new state
285 //
286 void System::tachSignalOffline(sdbusplus::message::message& msg,
287                                SensorMapType const& sensorMap)
288 {
289     std::string serviceName, oldOwner, newOwner;
290 
291     msg.read(serviceName);
292     msg.read(oldOwner);
293     msg.read(newOwner);
294 
295     // true if sensor server came back online, false -> went offline
296     bool hasOwner = !newOwner.empty() && oldOwner.empty();
297 
298     std::string stateStr(hasOwner ? "online" : "offline");
299     getLogger().log(fmt::format("Changing sensors for service {} to {}",
300                                 serviceName, stateStr),
301                     Logger::info);
302 
303     auto sensorItr(sensorMap.find(serviceName));
304 
305     if (sensorItr != sensorMap.end())
306     {
307         // set all sensors' owner state to not-owned
308         for (auto& sensor : sensorItr->second)
309         {
310             sensor->setOwner(hasOwner);
311             sensor->getFan().process(*sensor);
312         }
313     }
314 }
315 
316 void System::updateFanHealth(const Fan& fan)
317 {
318     std::vector<bool> sensorStatus;
319     for (const auto& sensor : fan.sensors())
320     {
321         sensorStatus.push_back(sensor->functional());
322     }
323 
324     _fanHealth[fan.getName()] =
325         std::make_tuple(fan.present(), std::move(sensorStatus));
326 }
327 
328 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck)
329 {
330     updateFanHealth(fan);
331 
332     if (_powerState->isPowerOn() && !skipRulesCheck)
333     {
334         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
335                       [this](auto& rule) {
336                           rule->check(PowerRuleState::runtime, _fanHealth);
337                       });
338     }
339 }
340 
341 void System::setFaultConfig([[maybe_unused]] const json& jsonObj)
342 {
343 #ifdef MONITOR_USE_JSON
344     std::shared_ptr<PowerInterfaceBase> powerInterface =
345         std::make_shared<PowerInterface>(_thermalAlert);
346 
347     PowerOffAction::PrePowerOffFunc func =
348         std::bind(std::mem_fn(&System::logShutdownError), this);
349 
350     _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func);
351 
352     _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj);
353 #endif
354 }
355 
356 void System::powerStateChanged(bool powerStateOn)
357 {
358     std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) {
359         fan->powerStateChanged(powerStateOn);
360     });
361 
362     if (powerStateOn)
363     {
364         if (!_loaded)
365         {
366             log<level::ERR>("No conf file found at power on");
367             throw std::runtime_error("No conf file found at power on");
368         }
369 
370         // If no fan has its sensors on D-Bus, then there is a problem
371         // with the fan controller.  Log an error and shut down.
372         if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
373                 return fan->numSensorsOnDBusAtPowerOn() == 0;
374             }))
375         {
376             handleOfflineFanController();
377             return;
378         }
379 
380         if (_sensorMatch.empty())
381         {
382             subscribeSensorsToServices();
383         }
384 
385         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
386                       [this](auto& rule) {
387                           rule->check(PowerRuleState::atPgood, _fanHealth);
388                       });
389         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
390                       [this](auto& rule) {
391                           rule->check(PowerRuleState::runtime, _fanHealth);
392                       });
393     }
394     else
395     {
396         _thermalAlert.enabled(false);
397 
398         // Cancel any in-progress power off actions
399         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
400                       [this](auto& rule) { rule->cancel(); });
401     }
402 }
403 
404 void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor)
405 {
406     std::string fanPath{util::INVENTORY_PATH + fan.getName()};
407 
408     getLogger().log(
409         fmt::format("Creating event log for faulted fan {} sensor {}", fanPath,
410                     sensor.name()),
411         Logger::error);
412 
413     // In order to know if the event log should have a severity of error or
414     // informational, count the number of existing nonfunctional sensors and
415     // compare it to _numNonfuncSensorsBeforeError.
416     size_t nonfuncSensors = 0;
417     for (const auto& fan : _fans)
418     {
419         for (const auto& s : fan->sensors())
420         {
421             // Don't count nonfunctional sensors that still have their
422             // error timer running as nonfunctional since they haven't
423             // had event logs created for those errors yet.
424             if (!s->functional() && !s->errorTimerRunning())
425             {
426                 nonfuncSensors++;
427             }
428         }
429     }
430 
431     Severity severity = Severity::Error;
432     if (nonfuncSensors < _numNonfuncSensorsBeforeError)
433     {
434         severity = Severity::Informational;
435     }
436 
437     auto error =
438         std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault",
439                                    fanPath, sensor.name(), severity);
440 
441     auto sensorData = captureSensorData();
442     error->commit(sensorData);
443 
444     // Save the error so it can be committed again on a power off.
445     _lastError = std::move(error);
446 }
447 
448 void System::fanMissingErrorTimerExpired(const Fan& fan)
449 {
450     std::string fanPath{util::INVENTORY_PATH + fan.getName()};
451 
452     getLogger().log(
453         fmt::format("Creating event log for missing fan {}", fanPath),
454         Logger::error);
455 
456     auto error = std::make_unique<FanError>(
457         "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error);
458 
459     auto sensorData = captureSensorData();
460     error->commit(sensorData);
461 
462     // Save the error so it can be committed again on a power off.
463     _lastError = std::move(error);
464 }
465 
466 void System::logShutdownError()
467 {
468     if (_lastError)
469     {
470         getLogger().log("Re-committing previous fan error before power off");
471 
472         // Still use the latest sensor data
473         auto sensorData = captureSensorData();
474         _lastError->commit(sensorData, true);
475     }
476 }
477 
478 json System::captureSensorData()
479 {
480     json data;
481 
482     for (const auto& fan : _fans)
483     {
484         for (const auto& sensor : fan->sensors())
485         {
486             json values;
487             values["present"] = fan->present();
488             values["functional"] = sensor->functional();
489             values["tach"] = sensor->getInput();
490 
491             if (sensor->hasTarget())
492             {
493                 values["target"] = sensor->getTarget();
494             }
495 
496             // convert between string/json to remove newlines
497             values["prev_tachs"] = json(sensor->getPrevTach()).dump();
498 
499             if (sensor->hasTarget())
500             {
501                 values["prev_targets"] = json(sensor->getPrevTarget()).dump();
502             }
503 
504             data["sensors"][sensor->name()] = values;
505         }
506     }
507 
508     return data;
509 }
510 
511 void System::handleOfflineFanController()
512 {
513     getLogger().log("The fan controller appears to be offline.  Shutting down.",
514                     Logger::error);
515 
516     auto ffdc = collectHwmonFFDC();
517 
518     FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline",
519                    Severity::Critical};
520     error.commit(ffdc, true);
521 
522     PowerInterface::executeHardPowerOff();
523 
524     createBmcDump();
525 }
526 
527 /**
528  * @brief Create a BMC Dump
529  */
530 void System::createBmcDump() const
531 {
532     try
533     {
534         util::SDBusPlus::callMethod(
535             "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc",
536             "xyz.openbmc_project.Dump.Create", "CreateDump",
537             std::vector<
538                 std::pair<std::string, std::variant<std::string, uint64_t>>>());
539     }
540     catch (const std::exception& e)
541     {
542         getLogger().log(
543             fmt::format("Caught exception while creating BMC dump: {}",
544                         e.what()),
545             Logger::error);
546     }
547 }
548 
549 } // namespace phosphor::fan::monitor
550