1 /**
2  * Copyright © 2022 IBM Corporation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "system.hpp"
17 
18 #include "dbus_paths.hpp"
19 #include "fan.hpp"
20 #include "fan_defs.hpp"
21 #include "tach_sensor.hpp"
22 #include "trust_manager.hpp"
23 #include "types.hpp"
24 #include "utility.hpp"
25 #ifdef MONITOR_USE_JSON
26 #include "json_config.hpp"
27 #include "json_parser.hpp"
28 #endif
29 
30 #include "config.h"
31 
32 #include "hwmon_ffdc.hpp"
33 
34 #include <nlohmann/json.hpp>
35 #include <phosphor-logging/log.hpp>
36 #include <sdbusplus/bus.hpp>
37 #include <sdbusplus/bus/match.hpp>
38 #include <sdeventplus/event.hpp>
39 #include <sdeventplus/source/signal.hpp>
40 
41 namespace phosphor::fan::monitor
42 {
43 
// Shorthand for the JSON type used for configuration and captured sensor data
using json = nlohmann::json;
// Severity level passed to FanError when event logs are created below
using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;

// Brings log<>, level and entry() into scope for the logging calls below
using namespace phosphor::logging;

// Path of the file that dumpDebugData() writes its JSON output to
const std::string System::dumpFile = "/tmp/fan_monitor_dump.json";
50 
System(Mode mode,sdbusplus::bus_t & bus,const sdeventplus::Event & event)51 System::System(Mode mode, sdbusplus::bus_t& bus,
52                const sdeventplus::Event& event) :
53     _mode(mode), _bus(bus), _event(event),
54 #ifdef MONITOR_USE_HOST_STATE
55     _powerState(std::make_unique<HostPowerState>(
56 #else
57     _powerState(std::make_unique<PGoodState>(
58 #endif
59         bus, std::bind(std::mem_fn(&System::powerStateChanged), this,
60                        std::placeholders::_1))),
61     _thermalAlert(bus, THERMAL_ALERT_OBJPATH)
62 {}
63 
start()64 void System::start()
65 {
66     namespace match = sdbusplus::bus::match;
67 
68     // must be done before service detection
69     _inventoryMatch = std::make_unique<sdbusplus::bus::match_t>(
70         _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC),
71         std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1));
72 
73     bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>(
74         _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus",
75         "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC);
76 
77     if (invServiceRunning)
78     {
79         _inventoryMatch.reset();
80 
81         if (!_loaded)
82         {
83             load();
84         }
85     }
86 }
87 
load()88 void System::load()
89 {
90     json jsonObj = json::object();
91 #ifdef MONITOR_USE_JSON
92     try
93     {
94         jsonObj = getJsonObj();
95 #endif
96         auto trustGrps = getTrustGroups(jsonObj);
97         auto fanDefs = getFanDefinitions(jsonObj);
98         // Retrieve and set trust groups within the trust manager
99         setTrustMgr(getTrustGroups(jsonObj));
100         // Clear/set configured fan definitions
101         _fans.clear();
102         _fanHealth.clear();
103         // Retrieve fan definitions and create fan objects to be monitored
104         setFans(fanDefs);
105         setFaultConfig(jsonObj);
106         log<level::INFO>("Configuration loaded");
107 
108         _loaded = true;
109 #ifdef MONITOR_USE_JSON
110     }
111     catch (const phosphor::fan::NoConfigFound&)
112     {}
113 #endif
114 
115     if (_powerState->isPowerOn())
116     {
117         // Fans could be missing on startup, so check the power off rules.
118         // Tach sensors default to functional, so they wouldn't cause a power
119         // off here.
120         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
121                       [this](auto& rule) {
122                           rule->check(PowerRuleState::runtime, _fanHealth);
123                       });
124     }
125 
126     subscribeSensorsToServices();
127 }
128 
/**
 * @brief Watches the D-Bus services that host the fan tach sensors
 *
 * Discards any existing sensor matches, looks up which services host
 * each sensor under FAN_SENSOR_PATH, and registers one nameOwnerChanged
 * match per service that calls tachSignalOffline().
 */
void System::subscribeSensorsToServices()
{
    _sensorMatch.clear();

    SensorMapType sensorsByService;

    // Gather every interface the sensors use.  The value interface is
    // always part of the list and the set drops duplicates.
    std::set<std::string> ifaceSet{util::FAN_SENSOR_VALUE_INTF};
    for (const auto& fan : _fans)
    {
        for (const auto& tach : fan->sensors())
        {
            ifaceSet.insert(tach->getInterface());
        }
    }

    // getSubTreeRaw takes the interfaces as a vector
    std::vector<std::string> interfaces(ifaceSet.begin(), ifaceSet.end());

    try
    {
        // Find the services hosting those interfaces
        auto serviceObjects = util::SDBusPlus::getSubTreeRaw(
            _bus, FAN_SENSOR_PATH, interfaces, 0);

        for (const auto& fan : _fans)
        {
            for (const auto& tach : fan->sensors())
            {
                const auto found = serviceObjects.find(tach->name());
                const bool onDBus = (found != serviceObjects.end()) &&
                                    !found->second.empty();

                if (!onDBus)
                {
                    getLogger().log(
                        std::format("Fan sensor entry {} not found in D-Bus",
                                    tach->name()),
                        Logger::error);
                    continue;
                }

                // Remember this sensor under every service hosting it
                for (const auto& serviceEntry : found->second)
                {
                    sensorsByService[serviceEntry.first].insert(tach);
                }
            }
        }

        // One match per service; each callback keeps its own copy of the
        // service to sensors map.
        for (const auto& mapEntry : sensorsByService)
        {
            _sensorMatch.emplace_back(std::make_unique<sdbusplus::bus::match_t>(
                _bus,
                sdbusplus::bus::match::rules::nameOwnerChanged(mapEntry.first),
                [this, sensorsByService](sdbusplus::message_t& msg) {
                    tachSignalOffline(msg, sensorsByService);
                }));
        }
    }
    catch (const util::DBusError&)
    {
        // getSubTreeRaw() throws when the fan sensor paths don't exist
        // yet, which is not an error here
    }
}
199 
inventoryOnlineCb(sdbusplus::message_t & msg)200 void System::inventoryOnlineCb(sdbusplus::message_t& msg)
201 {
202     namespace match = sdbusplus::bus::match;
203 
204     std::string iface;
205     msg.read(iface);
206 
207     if (util::INVENTORY_INTF != iface)
208     {
209         return;
210     }
211 
212     std::string oldName;
213     msg.read(oldName);
214 
215     std::string newName;
216     msg.read(newName);
217 
218     // newName should never be empty since match was reset on the first
219     // nameOwnerChanged signal received from the service.
220     if (!_loaded && !newName.empty())
221     {
222         load();
223     }
224 
225     // cancel any further notifications about the service state
226     _inventoryMatch.reset();
227 }
228 
sighupHandler(sdeventplus::source::Signal &,const struct signalfd_siginfo *)229 void System::sighupHandler(sdeventplus::source::Signal&,
230                            const struct signalfd_siginfo*)
231 {
232     try
233     {
234         load();
235     }
236     catch (std::runtime_error& re)
237     {
238         log<level::ERR>("Error reloading config, no config changes made",
239                         entry("LOAD_ERROR=%s", re.what()));
240     }
241 }
242 
243 const std::vector<CreateGroupFunction>
getTrustGroups(const json & jsonObj)244     System::getTrustGroups([[maybe_unused]] const json& jsonObj)
245 {
246 #ifdef MONITOR_USE_JSON
247     return getTrustGrps(jsonObj);
248 #else
249     return trustGroups;
250 #endif
251 }
252 
setTrustMgr(const std::vector<CreateGroupFunction> & groupFuncs)253 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs)
254 {
255     _trust = std::make_unique<trust::Manager>(groupFuncs);
256 }
257 
258 const std::vector<FanDefinition>
getFanDefinitions(const json & jsonObj)259     System::getFanDefinitions([[maybe_unused]] const json& jsonObj)
260 {
261 #ifdef MONITOR_USE_JSON
262     return getFanDefs(jsonObj);
263 #else
264     return fanDefinitions;
265 #endif
266 }
267 
setFans(const std::vector<FanDefinition> & fanDefs)268 void System::setFans(const std::vector<FanDefinition>& fanDefs)
269 {
270     for (const auto& fanDef : fanDefs)
271     {
272         // Check if a condition exists on the fan
273         auto condition = fanDef.condition;
274         if (condition)
275         {
276             // Condition exists, skip adding fan if it fails
277             if (!(*condition)(_bus))
278             {
279                 continue;
280             }
281         }
282         _fans.emplace_back(
283             std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this));
284 
285         updateFanHealth(*(_fans.back()));
286     }
287 }
288 
// Callback for a nameOwnerChanged signal of a service hosting tach
// sensors.  Works out whether the service came online or went away,
// applies that owner state to every sensor mapped to the service and
// lets the owning fan process each of them.
//
void System::tachSignalOffline(sdbusplus::message_t& msg,
                               const SensorMapType& sensorMap)
{
    std::string serviceName;
    std::string oldOwner;
    std::string newOwner;

    msg.read(serviceName, oldOwner, newOwner);

    // Online only when the name went from no owner to having one; every
    // other transition is handled as offline.
    const bool hasOwner = oldOwner.empty() && !newOwner.empty();

    getLogger().log(std::format("Changing sensors for service {} to {}",
                                serviceName,
                                hasOwner ? "online" : "offline"),
                    Logger::info);

    const auto entry = sensorMap.find(serviceName);
    if (entry == sensorMap.end())
    {
        return;
    }

    // Apply the new owner state to all sensors of this service
    for (auto& tach : entry->second)
    {
        tach->setOwner(hasOwner);
        tach->getFan().process(*tach);
    }
}
322 
updateFanHealth(const Fan & fan)323 void System::updateFanHealth(const Fan& fan)
324 {
325     std::vector<bool> sensorStatus;
326     for (const auto& sensor : fan.sensors())
327     {
328         sensorStatus.push_back(sensor->functional());
329     }
330 
331     _fanHealth[fan.getName()] =
332         std::make_tuple(fan.present(), std::move(sensorStatus));
333 }
334 
fanStatusChange(const Fan & fan,bool skipRulesCheck)335 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck)
336 {
337     updateFanHealth(fan);
338 
339     if (_powerState->isPowerOn() && !skipRulesCheck)
340     {
341         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
342                       [this](auto& rule) {
343                           rule->check(PowerRuleState::runtime, _fanHealth);
344                       });
345     }
346 }
347 
setFaultConfig(const json & jsonObj)348 void System::setFaultConfig([[maybe_unused]] const json& jsonObj)
349 {
350 #ifdef MONITOR_USE_JSON
351     std::shared_ptr<PowerInterfaceBase> powerInterface =
352         std::make_shared<PowerInterface>(_thermalAlert);
353 
354     PowerOffAction::PrePowerOffFunc func =
355         std::bind(std::mem_fn(&System::logShutdownError), this);
356 
357     _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func);
358 
359     _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj);
360 #endif
361 }
362 
powerStateChanged(bool powerStateOn)363 void System::powerStateChanged(bool powerStateOn)
364 {
365     std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) {
366         fan->powerStateChanged(powerStateOn);
367     });
368 
369     if (powerStateOn)
370     {
371         if (!_loaded)
372         {
373             log<level::ERR>("No conf file found at power on");
374             throw std::runtime_error("No conf file found at power on");
375         }
376 
377         // If no fan has its sensors on D-Bus, then there is a problem
378         // with the fan controller.  Log an error and shut down.
379         if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
380                 return fan->numSensorsOnDBusAtPowerOn() == 0;
381             }))
382         {
383 #if DELAY_HOST_CONTROL > 0
384             sleep(DELAY_HOST_CONTROL);
385             std::for_each(_fans.begin(), _fans.end(),
386                           [powerStateOn](auto& fan) {
387                               fan->powerStateChanged(powerStateOn);
388                           });
389             if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
390                     return fan->numSensorsOnDBusAtPowerOn() == 0;
391                 }))
392             {
393                 handleOfflineFanController();
394                 return;
395             }
396 #else
397             handleOfflineFanController();
398             return;
399 #endif
400         }
401 
402         if (_sensorMatch.empty())
403         {
404             subscribeSensorsToServices();
405         }
406 
407         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
408                       [this](auto& rule) {
409                           rule->check(PowerRuleState::atPgood, _fanHealth);
410                       });
411         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
412                       [this](auto& rule) {
413                           rule->check(PowerRuleState::runtime, _fanHealth);
414                       });
415     }
416     else
417     {
418         _thermalAlert.enabled(false);
419 
420         // Cancel any in-progress power off actions
421         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
422                       [this](auto& rule) { rule->cancel(); });
423     }
424 }
425 
/**
 * @brief Creates an event log for a faulted fan sensor
 *
 * The severity is Informational while fewer sensors than
 * _numNonfuncSensorsBeforeError are nonfunctional, and Error otherwise.
 * The committed error is kept so it can be committed again on a power
 * off.
 *
 * @param[in] fan - The fan the faulted sensor belongs to
 * @param[in] sensor - The faulted sensor
 */
void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor)
{
    std::string fanPath{util::INVENTORY_PATH + fan.getName()};

    getLogger().log(
        std::format("Creating event log for faulted fan {} sensor {}", fanPath,
                    sensor.name()),
        Logger::error);

    // In order to know if the event log should have a severity of error or
    // informational, count the number of existing nonfunctional sensors and
    // compare it to _numNonfuncSensorsBeforeError.
    size_t nonfuncSensors = 0;

    // Named differently from the 'fan' parameter so that it isn't shadowed
    for (const auto& monitoredFan : _fans)
    {
        for (const auto& s : monitoredFan->sensors())
        {
            // Don't count nonfunctional sensors that still have their
            // error timer running as nonfunctional since they haven't
            // had event logs created for those errors yet.
            if (!s->functional() && !s->errorTimerRunning())
            {
                nonfuncSensors++;
            }
        }
    }

    Severity severity = Severity::Error;
    if (nonfuncSensors < _numNonfuncSensorsBeforeError)
    {
        severity = Severity::Informational;
    }

    auto error =
        std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault",
                                   fanPath, sensor.name(), severity);

    auto sensorData = captureSensorData();
    error->commit(sensorData);

    // Save the error so it can be committed again on a power off.
    _lastError = std::move(error);
}
469 
/**
 * @brief Creates an event log for a missing fan
 *
 * The committed error is kept so it can be committed again on a power
 * off.
 *
 * @param[in] fan - The missing fan
 */
void System::fanMissingErrorTimerExpired(const Fan& fan)
{
    const std::string inventoryPath{util::INVENTORY_PATH + fan.getName()};

    getLogger().log(
        std::format("Creating event log for missing fan {}", inventoryPath),
        Logger::error);

    auto missingError = std::make_unique<FanError>(
        "xyz.openbmc_project.Fan.Error.Missing", inventoryPath, "",
        Severity::Error);

    // Attach the current sensor readings to the log
    auto snapshot = captureSensorData();
    missingError->commit(snapshot);

    // Kept for a possible re-commit at power off
    _lastError = std::move(missingError);
}
487 
logShutdownError()488 void System::logShutdownError()
489 {
490     if (_lastError)
491     {
492         getLogger().log("Re-committing previous fan error before power off");
493 
494         // Still use the latest sensor data
495         auto sensorData = captureSensorData();
496         _lastError->commit(sensorData, true);
497     }
498 }
499 
captureSensorData()500 json System::captureSensorData()
501 {
502     json data;
503 
504     for (const auto& fan : _fans)
505     {
506         for (const auto& sensor : fan->sensors())
507         {
508             json values;
509             values["present"] = fan->present();
510             values["functional"] = sensor->functional();
511             values["in_range"] = !fan->outOfRange(*sensor);
512             values["tach"] = sensor->getInput();
513 
514             if (sensor->hasTarget())
515             {
516                 values["target"] = sensor->getTarget();
517             }
518 
519             // convert between string/json to remove newlines
520             values["prev_tachs"] = json(sensor->getPrevTach()).dump();
521 
522             if (sensor->hasTarget())
523             {
524                 values["prev_targets"] = json(sensor->getPrevTarget()).dump();
525             }
526 
527             if (sensor->getMethod() == MethodMode::count)
528             {
529                 values["ticks"] = sensor->getCounter();
530             }
531             data["sensors"][sensor->name()] = values;
532         }
533     }
534 
535     return data;
536 }
537 
handleOfflineFanController()538 void System::handleOfflineFanController()
539 {
540     getLogger().log("The fan controller appears to be offline.  Shutting down.",
541                     Logger::error);
542 
543     auto ffdc = collectHwmonFFDC();
544 
545     FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline",
546                    Severity::Critical};
547     error.commit(ffdc, true);
548 
549     PowerInterface::executeHardPowerOff();
550 
551     createBmcDump();
552 }
553 
/**
 * @brief Requests a BMC dump from the dump manager
 *
 * Calls CreateDump without any additional parameters.  Any exception is
 * logged and not passed on to the caller.
 */
void System::createBmcDump() const
{
    // Type of the parameter list CreateDump takes; it is passed empty
    using DumpParams = std::vector<
        std::pair<std::string, std::variant<std::string, uint64_t>>>;

    try
    {
        util::SDBusPlus::callMethod(
            "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc",
            "xyz.openbmc_project.Dump.Create", "CreateDump", DumpParams());
    }
    catch (const std::exception& ex)
    {
        getLogger().log(
            std::format("Caught exception while creating BMC dump: {}",
                        ex.what()),
            Logger::error);
    }
}
575 
dumpDebugData(sdeventplus::source::Signal &,const struct signalfd_siginfo *)576 void System::dumpDebugData(sdeventplus::source::Signal&,
577                            const struct signalfd_siginfo*)
578 {
579     json output;
580 
581     if (_loaded)
582     {
583         output["logs"] = getLogger().getLogs();
584         output["sensors"] = captureSensorData();
585     }
586     else
587     {
588         output["error"] = "Fan monitor not loaded yet.  Try again later.";
589     }
590 
591     std::ofstream file{System::dumpFile};
592     if (!file)
593     {
594         log<level::ERR>("Could not open file for fan monitor dump");
595     }
596     else
597     {
598         file << std::setw(4) << output;
599     }
600 }
601 
602 } // namespace phosphor::fan::monitor
603