1 /**
2  * Copyright © 2022 IBM Corporation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "system.hpp"
17 
18 #include "dbus_paths.hpp"
19 #include "fan.hpp"
20 #include "fan_defs.hpp"
21 #include "tach_sensor.hpp"
22 #include "trust_manager.hpp"
23 #include "types.hpp"
24 #include "utility.hpp"
25 #ifdef MONITOR_USE_JSON
26 #include "json_config.hpp"
27 #include "json_parser.hpp"
28 #endif
29 
30 #include "config.h"
31 
32 #include "hwmon_ffdc.hpp"
33 
34 #include <nlohmann/json.hpp>
35 #include <phosphor-logging/log.hpp>
36 #include <sdbusplus/bus.hpp>
37 #include <sdbusplus/bus/match.hpp>
38 #include <sdeventplus/event.hpp>
39 #include <sdeventplus/source/signal.hpp>
40 
41 namespace phosphor::fan::monitor
42 {
43 
44 using json = nlohmann::json;
45 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
46 
47 using namespace phosphor::logging;
48 
49 const std::string System::dumpFile = "/tmp/fan_monitor_dump.json";
50 
System(Mode mode,sdbusplus::bus_t & bus,const sdeventplus::Event & event)51 System::System(Mode mode, sdbusplus::bus_t& bus,
52                const sdeventplus::Event& event) :
53     _mode(mode),
54     _bus(bus), _event(event),
55 #ifdef MONITOR_USE_HOST_STATE
56     _powerState(std::make_unique<HostPowerState>(
57 #else
58     _powerState(std::make_unique<PGoodState>(
59 #endif
60         bus, std::bind(std::mem_fn(&System::powerStateChanged), this,
61                        std::placeholders::_1))),
62     _thermalAlert(bus, THERMAL_ALERT_OBJPATH)
63 {}
64 
start()65 void System::start()
66 {
67     namespace match = sdbusplus::bus::match;
68 
69     // must be done before service detection
70     _inventoryMatch = std::make_unique<sdbusplus::bus::match_t>(
71         _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC),
72         std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1));
73 
74     bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>(
75         _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus",
76         "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC);
77 
78     if (invServiceRunning)
79     {
80         _inventoryMatch.reset();
81 
82         if (!_loaded)
83         {
84             load();
85         }
86     }
87 }
88 
load()89 void System::load()
90 {
91     json jsonObj = json::object();
92 #ifdef MONITOR_USE_JSON
93     try
94     {
95         jsonObj = getJsonObj();
96 #endif
97         auto trustGrps = getTrustGroups(jsonObj);
98         auto fanDefs = getFanDefinitions(jsonObj);
99         // Retrieve and set trust groups within the trust manager
100         setTrustMgr(getTrustGroups(jsonObj));
101         // Clear/set configured fan definitions
102         _fans.clear();
103         _fanHealth.clear();
104         // Retrieve fan definitions and create fan objects to be monitored
105         setFans(fanDefs);
106         setFaultConfig(jsonObj);
107         log<level::INFO>("Configuration loaded");
108 
109         _loaded = true;
110 #ifdef MONITOR_USE_JSON
111     }
112     catch (const phosphor::fan::NoConfigFound&)
113     {}
114 #endif
115 
116     if (_powerState->isPowerOn())
117     {
118         // Fans could be missing on startup, so check the power off rules.
119         // Tach sensors default to functional, so they wouldn't cause a power
120         // off here.
121         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
122                       [this](auto& rule) {
123             rule->check(PowerRuleState::runtime, _fanHealth);
124         });
125     }
126 
127     subscribeSensorsToServices();
128 }
129 
/**
 * @brief Watches the D-Bus services that host the fan tach sensors
 *
 * Discards any existing sensor matches, looks up which services host each
 * sensor under FAN_SENSOR_PATH, and registers one NameOwnerChanged match
 * per service whose callback is tachSignalOffline(). A util::DBusError
 * from the lookup is ignored, leaving no matches registered.
 */
void System::subscribeSensorsToServices()
{
    namespace rules = sdbusplus::bus::match::rules;

    _sensorMatch.clear();

    // Gather every interface used by a sensor. The value interface is
    // always part of the list, and the set discards duplicates.
    std::set<std::string> intfSet{util::FAN_SENSOR_VALUE_INTF};
    for (const auto& fan : _fans)
    {
        for (const auto& sensor : fan->sensors())
        {
            intfSet.insert(sensor->getInterface());
        }
    }

    // getSubTreeRaw() is handed a vector
    std::vector<std::string> interfaces(intfSet.begin(), intfSet.end());

    try
    {
        // Services hosting those interfaces, keyed by object path
        auto serviceObjects = util::SDBusPlus::getSubTreeRaw(
            _bus, FAN_SENSOR_PATH, interfaces, 0);

        // service name -> sensors hosted by that service
        SensorMapType sensorMap;

        for (const auto& fan : _fans)
        {
            for (const auto& sensor : fan->sensors())
            {
                const auto found = serviceObjects.find(sensor->name());
                const bool hasServices = (found != serviceObjects.end()) &&
                                         !found->second.empty();

                if (!hasServices)
                {
                    getLogger().log(
                        std::format("Fan sensor entry {} not found in D-Bus",
                                    sensor->name()),
                        Logger::error);
                    continue;
                }

                for (const auto& serviceEntry : found->second)
                {
                    sensorMap[serviceEntry.first].insert(sensor);
                }
            }
        }

        // A single match per service; each callback gets its own copy of
        // the full map
        for (const auto& mapEntry : sensorMap)
        {
            _sensorMatch.emplace_back(std::make_unique<sdbusplus::bus::match_t>(
                _bus, rules::nameOwnerChanged(mapEntry.first),
                std::bind(&System::tachSignalOffline, this,
                          std::placeholders::_1, sensorMap)));
        }
    }
    catch (const util::DBusError&)
    {
        // getSubTreeRaw() throws when the fan sensor paths don't exist yet
    }
}
200 
inventoryOnlineCb(sdbusplus::message_t & msg)201 void System::inventoryOnlineCb(sdbusplus::message_t& msg)
202 {
203     namespace match = sdbusplus::bus::match;
204 
205     std::string iface;
206     msg.read(iface);
207 
208     if (util::INVENTORY_INTF != iface)
209     {
210         return;
211     }
212 
213     std::string oldName;
214     msg.read(oldName);
215 
216     std::string newName;
217     msg.read(newName);
218 
219     // newName should never be empty since match was reset on the first
220     // nameOwnerChanged signal received from the service.
221     if (!_loaded && !newName.empty())
222     {
223         load();
224     }
225 
226     // cancel any further notifications about the service state
227     _inventoryMatch.reset();
228 }
229 
sighupHandler(sdeventplus::source::Signal &,const struct signalfd_siginfo *)230 void System::sighupHandler(sdeventplus::source::Signal&,
231                            const struct signalfd_siginfo*)
232 {
233     try
234     {
235         load();
236     }
237     catch (std::runtime_error& re)
238     {
239         log<level::ERR>("Error reloading config, no config changes made",
240                         entry("LOAD_ERROR=%s", re.what()));
241     }
242 }
243 
244 const std::vector<CreateGroupFunction>
getTrustGroups(const json & jsonObj)245     System::getTrustGroups([[maybe_unused]] const json& jsonObj)
246 {
247 #ifdef MONITOR_USE_JSON
248     return getTrustGrps(jsonObj);
249 #else
250     return trustGroups;
251 #endif
252 }
253 
setTrustMgr(const std::vector<CreateGroupFunction> & groupFuncs)254 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs)
255 {
256     _trust = std::make_unique<trust::Manager>(groupFuncs);
257 }
258 
259 const std::vector<FanDefinition>
getFanDefinitions(const json & jsonObj)260     System::getFanDefinitions([[maybe_unused]] const json& jsonObj)
261 {
262 #ifdef MONITOR_USE_JSON
263     return getFanDefs(jsonObj);
264 #else
265     return fanDefinitions;
266 #endif
267 }
268 
setFans(const std::vector<FanDefinition> & fanDefs)269 void System::setFans(const std::vector<FanDefinition>& fanDefs)
270 {
271     for (const auto& fanDef : fanDefs)
272     {
273         // Check if a condition exists on the fan
274         auto condition = fanDef.condition;
275         if (condition)
276         {
277             // Condition exists, skip adding fan if it fails
278             if (!(*condition)(_bus))
279             {
280                 continue;
281             }
282         }
283         _fans.emplace_back(
284             std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this));
285 
286         updateFanHealth(*(_fans.back()));
287     }
288 }
289 
/**
 * @brief Callback indicating a sensor hosting service went on/offline
 *
 * Reads the service name and its old and new owner from the
 * NameOwnerChanged signal, then sets the owner state of every sensor
 * mapped to that service and has the owning fan process the sensor.
 *
 * The service counts as online whenever the new owner is non-empty. That
 * includes an ownership transfer, where both the old and the new owner are
 * non-empty and the name never becomes unowned.
 *
 * @param[in] msg - The NameOwnerChanged signal message
 * @param[in] sensorMap - Map of service name to the sensors it hosts
 */
void System::tachSignalOffline(sdbusplus::message_t& msg,
                               const SensorMapType& sensorMap)
{
    std::string serviceName;
    std::string oldOwner;
    std::string newOwner;

    msg.read(serviceName);
    msg.read(oldOwner);
    msg.read(newOwner);

    // true -> the service name is owned after this change,
    // false -> the service went offline
    const bool hasOwner = !newOwner.empty();

    std::string stateStr(hasOwner ? "online" : "offline");
    getLogger().log(std::format("Changing sensors for service {} to {}",
                                serviceName, stateStr),
                    Logger::info);

    auto sensorItr(sensorMap.find(serviceName));

    if (sensorItr != sensorMap.end())
    {
        // Apply the new owner state to every sensor of this service and
        // let its fan re-evaluate the sensor
        for (auto& sensor : sensorItr->second)
        {
            sensor->setOwner(hasOwner);
            sensor->getFan().process(*sensor);
        }
    }
}
323 
updateFanHealth(const Fan & fan)324 void System::updateFanHealth(const Fan& fan)
325 {
326     std::vector<bool> sensorStatus;
327     for (const auto& sensor : fan.sensors())
328     {
329         sensorStatus.push_back(sensor->functional());
330     }
331 
332     _fanHealth[fan.getName()] = std::make_tuple(fan.present(),
333                                                 std::move(sensorStatus));
334 }
335 
fanStatusChange(const Fan & fan,bool skipRulesCheck)336 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck)
337 {
338     updateFanHealth(fan);
339 
340     if (_powerState->isPowerOn() && !skipRulesCheck)
341     {
342         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
343                       [this](auto& rule) {
344             rule->check(PowerRuleState::runtime, _fanHealth);
345         });
346     }
347 }
348 
setFaultConfig(const json & jsonObj)349 void System::setFaultConfig([[maybe_unused]] const json& jsonObj)
350 {
351 #ifdef MONITOR_USE_JSON
352     std::shared_ptr<PowerInterfaceBase> powerInterface =
353         std::make_shared<PowerInterface>(_thermalAlert);
354 
355     PowerOffAction::PrePowerOffFunc func =
356         std::bind(std::mem_fn(&System::logShutdownError), this);
357 
358     _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func);
359 
360     _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj);
361 #endif
362 }
363 
powerStateChanged(bool powerStateOn)364 void System::powerStateChanged(bool powerStateOn)
365 {
366     std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) {
367         fan->powerStateChanged(powerStateOn);
368     });
369 
370     if (powerStateOn)
371     {
372         if (!_loaded)
373         {
374             log<level::ERR>("No conf file found at power on");
375             throw std::runtime_error("No conf file found at power on");
376         }
377 
378         // If no fan has its sensors on D-Bus, then there is a problem
379         // with the fan controller.  Log an error and shut down.
380         if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
381             return fan->numSensorsOnDBusAtPowerOn() == 0;
382         }))
383         {
384 #if DELAY_HOST_CONTROL > 0
385             sleep(DELAY_HOST_CONTROL);
386             std::for_each(_fans.begin(), _fans.end(),
387                           [powerStateOn](auto& fan) {
388                 fan->powerStateChanged(powerStateOn);
389             });
390             if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
391                 return fan->numSensorsOnDBusAtPowerOn() == 0;
392             }))
393             {
394                 handleOfflineFanController();
395                 return;
396             }
397 #else
398             handleOfflineFanController();
399             return;
400 #endif
401         }
402 
403         if (_sensorMatch.empty())
404         {
405             subscribeSensorsToServices();
406         }
407 
408         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
409                       [this](auto& rule) {
410             rule->check(PowerRuleState::atPgood, _fanHealth);
411         });
412         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
413                       [this](auto& rule) {
414             rule->check(PowerRuleState::runtime, _fanHealth);
415         });
416     }
417     else
418     {
419         _thermalAlert.enabled(false);
420 
421         // Cancel any in-progress power off actions
422         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
423                       [this](auto& rule) { rule->cancel(); });
424     }
425 }
426 
/**
 * @brief Creates an event log for a faulted fan sensor
 *
 * The severity is Informational when fewer than
 * _numNonfuncSensorsBeforeError sensors are currently counted as
 * nonfunctional, and Error otherwise. The committed error is kept in
 * _lastError so it can be committed again on a power off.
 *
 * @param[in] fan - The fan the faulted sensor belongs to
 * @param[in] sensor - The faulted sensor
 */
void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor)
{
    std::string fanPath{util::INVENTORY_PATH + fan.getName()};

    getLogger().log(
        std::format("Creating event log for faulted fan {} sensor {}", fanPath,
                    sensor.name()),
        Logger::error);

    // In order to know if the event log should have a severity of error or
    // informational, count the number of existing nonfunctional sensors and
    // compare it to _numNonfuncSensorsBeforeError.
    size_t nonfuncSensors = 0;
    // The loop variable is deliberately not called 'fan' so that it does
    // not shadow the parameter.
    for (const auto& monitoredFan : _fans)
    {
        for (const auto& s : monitoredFan->sensors())
        {
            // Don't count nonfunctional sensors that still have their
            // error timer running as nonfunctional since they haven't
            // had event logs created for those errors yet.
            if (!s->functional() && !s->errorTimerRunning())
            {
                nonfuncSensors++;
            }
        }
    }

    Severity severity = Severity::Error;
    if (nonfuncSensors < _numNonfuncSensorsBeforeError)
    {
        severity = Severity::Informational;
    }

    auto error =
        std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault",
                                   fanPath, sensor.name(), severity);

    auto sensorData = captureSensorData();
    error->commit(sensorData);

    // Save the error so it can be committed again on a power off.
    _lastError = std::move(error);
}
470 
/**
 * @brief Creates an event log for a missing fan
 *
 * Commits an Error severity event log with the current sensor data and
 * keeps it in _lastError so it can be committed again on a power off.
 *
 * @param[in] fan - The missing fan
 */
void System::fanMissingErrorTimerExpired(const Fan& fan)
{
    const std::string fanPath = util::INVENTORY_PATH + fan.getName();

    getLogger().log(
        std::format("Creating event log for missing fan {}", fanPath),
        Logger::error);

    auto missingError = std::make_unique<FanError>(
        "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error);

    auto ffdc = captureSensorData();
    missingError->commit(ffdc);

    // Remember it for a possible re-commit on a power off
    _lastError = std::move(missingError);
}
488 
logShutdownError()489 void System::logShutdownError()
490 {
491     if (_lastError)
492     {
493         getLogger().log("Re-committing previous fan error before power off");
494 
495         // Still use the latest sensor data
496         auto sensorData = captureSensorData();
497         _lastError->commit(sensorData, true);
498     }
499 }
500 
captureSensorData()501 json System::captureSensorData()
502 {
503     json data;
504 
505     for (const auto& fan : _fans)
506     {
507         for (const auto& sensor : fan->sensors())
508         {
509             json values;
510             values["present"] = fan->present();
511             values["functional"] = sensor->functional();
512             values["in_range"] = !fan->outOfRange(*sensor);
513             values["tach"] = sensor->getInput();
514 
515             if (sensor->hasTarget())
516             {
517                 values["target"] = sensor->getTarget();
518             }
519 
520             // convert between string/json to remove newlines
521             values["prev_tachs"] = json(sensor->getPrevTach()).dump();
522 
523             if (sensor->hasTarget())
524             {
525                 values["prev_targets"] = json(sensor->getPrevTarget()).dump();
526             }
527 
528             if (sensor->getMethod() == MethodMode::count)
529             {
530                 values["ticks"] = sensor->getCounter();
531             }
532             data["sensors"][sensor->name()] = values;
533         }
534     }
535 
536     return data;
537 }
538 
handleOfflineFanController()539 void System::handleOfflineFanController()
540 {
541     getLogger().log("The fan controller appears to be offline.  Shutting down.",
542                     Logger::error);
543 
544     auto ffdc = collectHwmonFFDC();
545 
546     FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline",
547                    Severity::Critical};
548     error.commit(ffdc, true);
549 
550     PowerInterface::executeHardPowerOff();
551 
552     createBmcDump();
553 }
554 
/**
 * @brief Create a BMC Dump
 *
 * Calls CreateDump on the dump manager with an empty parameter list. Any
 * std::exception raised by the call is logged and not propagated.
 */
void System::createBmcDump() const
{
    // Type of the (empty) parameter list handed to CreateDump
    using DumpParams =
        std::vector<std::pair<std::string, std::variant<std::string, uint64_t>>>;

    try
    {
        util::SDBusPlus::callMethod(
            "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc",
            "xyz.openbmc_project.Dump.Create", "CreateDump", DumpParams());
    }
    catch (const std::exception& ex)
    {
        getLogger().log(
            std::format("Caught exception while creating BMC dump: {}",
                        ex.what()),
            Logger::error);
    }
}
576 
dumpDebugData(sdeventplus::source::Signal &,const struct signalfd_siginfo *)577 void System::dumpDebugData(sdeventplus::source::Signal&,
578                            const struct signalfd_siginfo*)
579 {
580     json output;
581 
582     if (_loaded)
583     {
584         output["logs"] = getLogger().getLogs();
585         output["sensors"] = captureSensorData();
586     }
587     else
588     {
589         output["error"] = "Fan monitor not loaded yet.  Try again later.";
590     }
591 
592     std::ofstream file{System::dumpFile};
593     if (!file)
594     {
595         log<level::ERR>("Could not open file for fan monitor dump");
596     }
597     else
598     {
599         file << std::setw(4) << output;
600     }
601 }
602 
603 } // namespace phosphor::fan::monitor
604