1 /**
2  * Copyright © 2022 IBM Corporation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "system.hpp"
17 
18 #include "dbus_paths.hpp"
19 #include "fan.hpp"
20 #include "fan_defs.hpp"
21 #include "tach_sensor.hpp"
22 #include "trust_manager.hpp"
23 #include "types.hpp"
24 #include "utility.hpp"
25 #ifdef MONITOR_USE_JSON
26 #include "json_config.hpp"
27 #include "json_parser.hpp"
28 #endif
29 
30 #include "config.h"
31 
32 #include "hwmon_ffdc.hpp"
33 
34 #include <nlohmann/json.hpp>
35 #include <phosphor-logging/log.hpp>
36 #include <sdbusplus/bus.hpp>
37 #include <sdbusplus/bus/match.hpp>
38 #include <sdeventplus/event.hpp>
39 #include <sdeventplus/source/signal.hpp>
40 
41 namespace phosphor::fan::monitor
42 {
43 
44 using json = nlohmann::json;
45 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
46 
47 using namespace phosphor::logging;
48 
49 System::System(Mode mode, sdbusplus::bus_t& bus,
50                const sdeventplus::Event& event) :
51     _mode(mode),
52     _bus(bus), _event(event),
53     _powerState(std::make_unique<PGoodState>(
54         bus, std::bind(std::mem_fn(&System::powerStateChanged), this,
55                        std::placeholders::_1))),
56     _thermalAlert(bus, THERMAL_ALERT_OBJPATH)
57 {}
58 
59 void System::start()
60 {
61     namespace match = sdbusplus::bus::match;
62 
63     // must be done before service detection
64     _inventoryMatch = std::make_unique<sdbusplus::bus::match_t>(
65         _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC),
66         std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1));
67 
68     bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>(
69         _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus",
70         "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC);
71 
72     if (invServiceRunning)
73     {
74         _inventoryMatch.reset();
75 
76         if (!_loaded)
77         {
78             load();
79         }
80     }
81 }
82 
83 void System::load()
84 {
85     json jsonObj = json::object();
86 #ifdef MONITOR_USE_JSON
87     try
88     {
89         jsonObj = getJsonObj();
90 #endif
91         auto trustGrps = getTrustGroups(jsonObj);
92         auto fanDefs = getFanDefinitions(jsonObj);
93         // Retrieve and set trust groups within the trust manager
94         setTrustMgr(getTrustGroups(jsonObj));
95         // Clear/set configured fan definitions
96         _fans.clear();
97         _fanHealth.clear();
98         // Retrieve fan definitions and create fan objects to be monitored
99         setFans(fanDefs);
100         setFaultConfig(jsonObj);
101         log<level::INFO>("Configuration loaded");
102 
103         _loaded = true;
104 #ifdef MONITOR_USE_JSON
105     }
106     catch (const phosphor::fan::NoConfigFound&)
107     {}
108 #endif
109 
110     if (_powerState->isPowerOn())
111     {
112         // Fans could be missing on startup, so check the power off rules.
113         // Tach sensors default to functional, so they wouldn't cause a power
114         // off here.
115         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
116                       [this](auto& rule) {
117                           rule->check(PowerRuleState::runtime, _fanHealth);
118                       });
119     }
120 
121     subscribeSensorsToServices();
122 }
123 
124 void System::subscribeSensorsToServices()
125 {
126     namespace match = sdbusplus::bus::match;
127 
128     _sensorMatch.clear();
129 
130     SensorMapType sensorMap;
131 
132     // build a list of all interfaces, always including the value interface
133     // using set automatically guards against duplicates
134     std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF};
135 
136     for (const auto& fan : _fans)
137     {
138         for (const auto& sensor : fan->sensors())
139         {
140             unique_interfaces.insert(sensor->getInterface());
141         }
142     }
143     // convert them to vector to pass into getSubTreeRaw
144     std::vector<std::string> interfaces(unique_interfaces.begin(),
145                                         unique_interfaces.end());
146 
147     try
148     {
149         // get service information for all service names that are
150         // hosting these interfaces
151         auto serviceObjects = util::SDBusPlus::getSubTreeRaw(
152             _bus, FAN_SENSOR_PATH, interfaces, 0);
153 
154         for (const auto& fan : _fans)
155         {
156             // For every sensor in each fan
157             for (const auto& sensor : fan->sensors())
158             {
159                 const auto itServ = serviceObjects.find(sensor->name());
160 
161                 if (serviceObjects.end() == itServ || itServ->second.empty())
162                 {
163                     getLogger().log(
164                         fmt::format("Fan sensor entry {} not found in D-Bus",
165                                     sensor->name()),
166                         Logger::error);
167                     continue;
168                 }
169 
170                 for (const auto& [serviceName, unused] : itServ->second)
171                 {
172                     // associate service name with sensor
173                     sensorMap[serviceName].insert(sensor);
174                 }
175             }
176         }
177 
178         // only create 1 match per service
179         for (const auto& [serviceName, unused] : sensorMap)
180         {
181             // map its service name to the sensor
182             _sensorMatch.emplace_back(std::make_unique<sdbusplus::bus::match_t>(
183                 _bus, match::rules::nameOwnerChanged(serviceName),
184                 std::bind(&System::tachSignalOffline, this,
185                           std::placeholders::_1, sensorMap)));
186         }
187     }
188     catch (const util::DBusError&)
189     {
190         // catch exception from getSubTreeRaw() when fan sensor paths don't
191         // exist yet
192     }
193 }
194 
195 void System::inventoryOnlineCb(sdbusplus::message_t& msg)
196 {
197     namespace match = sdbusplus::bus::match;
198 
199     std::string iface;
200     msg.read(iface);
201 
202     if (util::INVENTORY_INTF != iface)
203     {
204         return;
205     }
206 
207     std::string oldName;
208     msg.read(oldName);
209 
210     std::string newName;
211     msg.read(newName);
212 
213     // newName should never be empty since match was reset on the first
214     // nameOwnerChanged signal received from the service.
215     if (!_loaded && !newName.empty())
216     {
217         load();
218     }
219 
220     // cancel any further notifications about the service state
221     _inventoryMatch.reset();
222 }
223 
224 void System::sighupHandler(sdeventplus::source::Signal&,
225                            const struct signalfd_siginfo*)
226 {
227     try
228     {
229         load();
230     }
231     catch (std::runtime_error& re)
232     {
233         log<level::ERR>("Error reloading config, no config changes made",
234                         entry("LOAD_ERROR=%s", re.what()));
235     }
236 }
237 
238 const std::vector<CreateGroupFunction>
239     System::getTrustGroups([[maybe_unused]] const json& jsonObj)
240 {
241 #ifdef MONITOR_USE_JSON
242     return getTrustGrps(jsonObj);
243 #else
244     return trustGroups;
245 #endif
246 }
247 
248 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs)
249 {
250     _trust = std::make_unique<trust::Manager>(groupFuncs);
251 }
252 
253 const std::vector<FanDefinition>
254     System::getFanDefinitions([[maybe_unused]] const json& jsonObj)
255 {
256 #ifdef MONITOR_USE_JSON
257     return getFanDefs(jsonObj);
258 #else
259     return fanDefinitions;
260 #endif
261 }
262 
263 void System::setFans(const std::vector<FanDefinition>& fanDefs)
264 {
265     for (const auto& fanDef : fanDefs)
266     {
267         // Check if a condition exists on the fan
268         auto condition = std::get<conditionField>(fanDef);
269         if (condition)
270         {
271             // Condition exists, skip adding fan if it fails
272             if (!(*condition)(_bus))
273             {
274                 continue;
275             }
276         }
277         _fans.emplace_back(
278             std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this));
279 
280         updateFanHealth(*(_fans.back()));
281     }
282 }
283 
284 // callback indicating a service went [on|off]line.
285 // Determine on/offline status, set all sensors for that service
286 // to new state
287 //
288 void System::tachSignalOffline(sdbusplus::message_t& msg,
289                                SensorMapType const& sensorMap)
290 {
291     std::string serviceName, oldOwner, newOwner;
292 
293     msg.read(serviceName);
294     msg.read(oldOwner);
295     msg.read(newOwner);
296 
297     // true if sensor server came back online, false -> went offline
298     bool hasOwner = !newOwner.empty() && oldOwner.empty();
299 
300     std::string stateStr(hasOwner ? "online" : "offline");
301     getLogger().log(fmt::format("Changing sensors for service {} to {}",
302                                 serviceName, stateStr),
303                     Logger::info);
304 
305     auto sensorItr(sensorMap.find(serviceName));
306 
307     if (sensorItr != sensorMap.end())
308     {
309         // set all sensors' owner state to not-owned
310         for (auto& sensor : sensorItr->second)
311         {
312             sensor->setOwner(hasOwner);
313             sensor->getFan().process(*sensor);
314         }
315     }
316 }
317 
318 void System::updateFanHealth(const Fan& fan)
319 {
320     std::vector<bool> sensorStatus;
321     for (const auto& sensor : fan.sensors())
322     {
323         sensorStatus.push_back(sensor->functional());
324     }
325 
326     _fanHealth[fan.getName()] =
327         std::make_tuple(fan.present(), std::move(sensorStatus));
328 }
329 
330 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck)
331 {
332     updateFanHealth(fan);
333 
334     if (_powerState->isPowerOn() && !skipRulesCheck)
335     {
336         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
337                       [this](auto& rule) {
338                           rule->check(PowerRuleState::runtime, _fanHealth);
339                       });
340     }
341 }
342 
343 void System::setFaultConfig([[maybe_unused]] const json& jsonObj)
344 {
345 #ifdef MONITOR_USE_JSON
346     std::shared_ptr<PowerInterfaceBase> powerInterface =
347         std::make_shared<PowerInterface>(_thermalAlert);
348 
349     PowerOffAction::PrePowerOffFunc func =
350         std::bind(std::mem_fn(&System::logShutdownError), this);
351 
352     _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func);
353 
354     _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj);
355 #endif
356 }
357 
358 void System::powerStateChanged(bool powerStateOn)
359 {
360     std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) {
361         fan->powerStateChanged(powerStateOn);
362     });
363 
364     if (powerStateOn)
365     {
366         if (!_loaded)
367         {
368             log<level::ERR>("No conf file found at power on");
369             throw std::runtime_error("No conf file found at power on");
370         }
371 
372         // If no fan has its sensors on D-Bus, then there is a problem
373         // with the fan controller.  Log an error and shut down.
374         if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
375                 return fan->numSensorsOnDBusAtPowerOn() == 0;
376             }))
377         {
378             handleOfflineFanController();
379             return;
380         }
381 
382         if (_sensorMatch.empty())
383         {
384             subscribeSensorsToServices();
385         }
386 
387         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
388                       [this](auto& rule) {
389                           rule->check(PowerRuleState::atPgood, _fanHealth);
390                       });
391         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
392                       [this](auto& rule) {
393                           rule->check(PowerRuleState::runtime, _fanHealth);
394                       });
395     }
396     else
397     {
398         _thermalAlert.enabled(false);
399 
400         // Cancel any in-progress power off actions
401         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
402                       [this](auto& rule) { rule->cancel(); });
403     }
404 }
405 
406 void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor)
407 {
408     std::string fanPath{util::INVENTORY_PATH + fan.getName()};
409 
410     getLogger().log(
411         fmt::format("Creating event log for faulted fan {} sensor {}", fanPath,
412                     sensor.name()),
413         Logger::error);
414 
415     // In order to know if the event log should have a severity of error or
416     // informational, count the number of existing nonfunctional sensors and
417     // compare it to _numNonfuncSensorsBeforeError.
418     size_t nonfuncSensors = 0;
419     for (const auto& fan : _fans)
420     {
421         for (const auto& s : fan->sensors())
422         {
423             // Don't count nonfunctional sensors that still have their
424             // error timer running as nonfunctional since they haven't
425             // had event logs created for those errors yet.
426             if (!s->functional() && !s->errorTimerRunning())
427             {
428                 nonfuncSensors++;
429             }
430         }
431     }
432 
433     Severity severity = Severity::Error;
434     if (nonfuncSensors < _numNonfuncSensorsBeforeError)
435     {
436         severity = Severity::Informational;
437     }
438 
439     auto error =
440         std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault",
441                                    fanPath, sensor.name(), severity);
442 
443     auto sensorData = captureSensorData();
444     error->commit(sensorData);
445 
446     // Save the error so it can be committed again on a power off.
447     _lastError = std::move(error);
448 }
449 
450 void System::fanMissingErrorTimerExpired(const Fan& fan)
451 {
452     std::string fanPath{util::INVENTORY_PATH + fan.getName()};
453 
454     getLogger().log(
455         fmt::format("Creating event log for missing fan {}", fanPath),
456         Logger::error);
457 
458     auto error = std::make_unique<FanError>(
459         "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error);
460 
461     auto sensorData = captureSensorData();
462     error->commit(sensorData);
463 
464     // Save the error so it can be committed again on a power off.
465     _lastError = std::move(error);
466 }
467 
468 void System::logShutdownError()
469 {
470     if (_lastError)
471     {
472         getLogger().log("Re-committing previous fan error before power off");
473 
474         // Still use the latest sensor data
475         auto sensorData = captureSensorData();
476         _lastError->commit(sensorData, true);
477     }
478 }
479 
480 json System::captureSensorData()
481 {
482     json data;
483 
484     for (const auto& fan : _fans)
485     {
486         for (const auto& sensor : fan->sensors())
487         {
488             json values;
489             values["present"] = fan->present();
490             values["functional"] = sensor->functional();
491             values["tach"] = sensor->getInput();
492 
493             if (sensor->hasTarget())
494             {
495                 values["target"] = sensor->getTarget();
496             }
497 
498             // convert between string/json to remove newlines
499             values["prev_tachs"] = json(sensor->getPrevTach()).dump();
500 
501             if (sensor->hasTarget())
502             {
503                 values["prev_targets"] = json(sensor->getPrevTarget()).dump();
504             }
505 
506             if (sensor->getMethod() == MethodMode::count)
507             {
508                 values["ticks"] = sensor->getCounter();
509             }
510             data["sensors"][sensor->name()] = values;
511         }
512     }
513 
514     return data;
515 }
516 
517 void System::handleOfflineFanController()
518 {
519     getLogger().log("The fan controller appears to be offline.  Shutting down.",
520                     Logger::error);
521 
522     auto ffdc = collectHwmonFFDC();
523 
524     FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline",
525                    Severity::Critical};
526     error.commit(ffdc, true);
527 
528     PowerInterface::executeHardPowerOff();
529 
530     createBmcDump();
531 }
532 
533 /**
534  * @brief Create a BMC Dump
535  */
536 void System::createBmcDump() const
537 {
538     try
539     {
540         util::SDBusPlus::callMethod(
541             "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc",
542             "xyz.openbmc_project.Dump.Create", "CreateDump",
543             std::vector<
544                 std::pair<std::string, std::variant<std::string, uint64_t>>>());
545     }
546     catch (const std::exception& e)
547     {
548         getLogger().log(
549             fmt::format("Caught exception while creating BMC dump: {}",
550                         e.what()),
551             Logger::error);
552     }
553 }
554 
555 } // namespace phosphor::fan::monitor
556