1 /**
2  * Copyright © 2022 IBM Corporation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "system.hpp"
17 
18 #include "dbus_paths.hpp"
19 #include "fan.hpp"
20 #include "fan_defs.hpp"
21 #include "tach_sensor.hpp"
22 #include "trust_manager.hpp"
23 #include "types.hpp"
24 #include "utility.hpp"
25 #ifdef MONITOR_USE_JSON
26 #include "json_config.hpp"
27 #include "json_parser.hpp"
28 #endif
29 
30 #include "config.h"
31 
32 #include "hwmon_ffdc.hpp"
33 
34 #include <nlohmann/json.hpp>
35 #include <phosphor-logging/log.hpp>
36 #include <sdbusplus/bus.hpp>
37 #include <sdbusplus/bus/match.hpp>
38 #include <sdeventplus/event.hpp>
39 #include <sdeventplus/source/signal.hpp>
40 
41 namespace phosphor::fan::monitor
42 {
43 
// Shorthand for the nlohmann JSON type used for configuration and debug data
using json = nlohmann::json;
// Severity levels passed to FanError when creating event logs
using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;

// Brings log<>, level and entry() into scope for the journal logging calls
using namespace phosphor::logging;

// File that dumpDebugData() writes its JSON output to
const std::string System::dumpFile = "/tmp/fan_monitor_dump.json";
50 
51 System::System(Mode mode, sdbusplus::bus_t& bus,
52                const sdeventplus::Event& event) :
53     _mode(mode),
54     _bus(bus), _event(event),
55     _powerState(std::make_unique<PGoodState>(
56         bus, std::bind(std::mem_fn(&System::powerStateChanged), this,
57                        std::placeholders::_1))),
58     _thermalAlert(bus, THERMAL_ALERT_OBJPATH)
59 {}
60 
61 void System::start()
62 {
63     namespace match = sdbusplus::bus::match;
64 
65     // must be done before service detection
66     _inventoryMatch = std::make_unique<sdbusplus::bus::match_t>(
67         _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC),
68         std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1));
69 
70     bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>(
71         _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus",
72         "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC);
73 
74     if (invServiceRunning)
75     {
76         _inventoryMatch.reset();
77 
78         if (!_loaded)
79         {
80             load();
81         }
82     }
83 }
84 
85 void System::load()
86 {
87     json jsonObj = json::object();
88 #ifdef MONITOR_USE_JSON
89     try
90     {
91         jsonObj = getJsonObj();
92 #endif
93         auto trustGrps = getTrustGroups(jsonObj);
94         auto fanDefs = getFanDefinitions(jsonObj);
95         // Retrieve and set trust groups within the trust manager
96         setTrustMgr(getTrustGroups(jsonObj));
97         // Clear/set configured fan definitions
98         _fans.clear();
99         _fanHealth.clear();
100         // Retrieve fan definitions and create fan objects to be monitored
101         setFans(fanDefs);
102         setFaultConfig(jsonObj);
103         log<level::INFO>("Configuration loaded");
104 
105         _loaded = true;
106 #ifdef MONITOR_USE_JSON
107     }
108     catch (const phosphor::fan::NoConfigFound&)
109     {}
110 #endif
111 
112     if (_powerState->isPowerOn())
113     {
114         // Fans could be missing on startup, so check the power off rules.
115         // Tach sensors default to functional, so they wouldn't cause a power
116         // off here.
117         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
118                       [this](auto& rule) {
119                           rule->check(PowerRuleState::runtime, _fanHealth);
120                       });
121     }
122 
123     subscribeSensorsToServices();
124 }
125 
126 void System::subscribeSensorsToServices()
127 {
128     namespace match = sdbusplus::bus::match;
129 
130     _sensorMatch.clear();
131 
132     SensorMapType sensorMap;
133 
134     // build a list of all interfaces, always including the value interface
135     // using set automatically guards against duplicates
136     std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF};
137 
138     for (const auto& fan : _fans)
139     {
140         for (const auto& sensor : fan->sensors())
141         {
142             unique_interfaces.insert(sensor->getInterface());
143         }
144     }
145     // convert them to vector to pass into getSubTreeRaw
146     std::vector<std::string> interfaces(unique_interfaces.begin(),
147                                         unique_interfaces.end());
148 
149     try
150     {
151         // get service information for all service names that are
152         // hosting these interfaces
153         auto serviceObjects = util::SDBusPlus::getSubTreeRaw(
154             _bus, FAN_SENSOR_PATH, interfaces, 0);
155 
156         for (const auto& fan : _fans)
157         {
158             // For every sensor in each fan
159             for (const auto& sensor : fan->sensors())
160             {
161                 const auto itServ = serviceObjects.find(sensor->name());
162 
163                 if (serviceObjects.end() == itServ || itServ->second.empty())
164                 {
165                     getLogger().log(
166                         fmt::format("Fan sensor entry {} not found in D-Bus",
167                                     sensor->name()),
168                         Logger::error);
169                     continue;
170                 }
171 
172                 for (const auto& [serviceName, unused] : itServ->second)
173                 {
174                     // associate service name with sensor
175                     sensorMap[serviceName].insert(sensor);
176                 }
177             }
178         }
179 
180         // only create 1 match per service
181         for (const auto& [serviceName, unused] : sensorMap)
182         {
183             // map its service name to the sensor
184             _sensorMatch.emplace_back(std::make_unique<sdbusplus::bus::match_t>(
185                 _bus, match::rules::nameOwnerChanged(serviceName),
186                 std::bind(&System::tachSignalOffline, this,
187                           std::placeholders::_1, sensorMap)));
188         }
189     }
190     catch (const util::DBusError&)
191     {
192         // catch exception from getSubTreeRaw() when fan sensor paths don't
193         // exist yet
194     }
195 }
196 
197 void System::inventoryOnlineCb(sdbusplus::message_t& msg)
198 {
199     namespace match = sdbusplus::bus::match;
200 
201     std::string iface;
202     msg.read(iface);
203 
204     if (util::INVENTORY_INTF != iface)
205     {
206         return;
207     }
208 
209     std::string oldName;
210     msg.read(oldName);
211 
212     std::string newName;
213     msg.read(newName);
214 
215     // newName should never be empty since match was reset on the first
216     // nameOwnerChanged signal received from the service.
217     if (!_loaded && !newName.empty())
218     {
219         load();
220     }
221 
222     // cancel any further notifications about the service state
223     _inventoryMatch.reset();
224 }
225 
226 void System::sighupHandler(sdeventplus::source::Signal&,
227                            const struct signalfd_siginfo*)
228 {
229     try
230     {
231         load();
232     }
233     catch (std::runtime_error& re)
234     {
235         log<level::ERR>("Error reloading config, no config changes made",
236                         entry("LOAD_ERROR=%s", re.what()));
237     }
238 }
239 
240 const std::vector<CreateGroupFunction>
241     System::getTrustGroups([[maybe_unused]] const json& jsonObj)
242 {
243 #ifdef MONITOR_USE_JSON
244     return getTrustGrps(jsonObj);
245 #else
246     return trustGroups;
247 #endif
248 }
249 
250 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs)
251 {
252     _trust = std::make_unique<trust::Manager>(groupFuncs);
253 }
254 
255 const std::vector<FanDefinition>
256     System::getFanDefinitions([[maybe_unused]] const json& jsonObj)
257 {
258 #ifdef MONITOR_USE_JSON
259     return getFanDefs(jsonObj);
260 #else
261     return fanDefinitions;
262 #endif
263 }
264 
265 void System::setFans(const std::vector<FanDefinition>& fanDefs)
266 {
267     for (const auto& fanDef : fanDefs)
268     {
269         // Check if a condition exists on the fan
270         auto condition = std::get<conditionField>(fanDef);
271         if (condition)
272         {
273             // Condition exists, skip adding fan if it fails
274             if (!(*condition)(_bus))
275             {
276                 continue;
277             }
278         }
279         _fans.emplace_back(
280             std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this));
281 
282         updateFanHealth(*(_fans.back()));
283     }
284 }
285 
286 // callback indicating a service went [on|off]line.
287 // Determine on/offline status, set all sensors for that service
288 // to new state
289 //
290 void System::tachSignalOffline(sdbusplus::message_t& msg,
291                                SensorMapType const& sensorMap)
292 {
293     std::string serviceName, oldOwner, newOwner;
294 
295     msg.read(serviceName);
296     msg.read(oldOwner);
297     msg.read(newOwner);
298 
299     // true if sensor server came back online, false -> went offline
300     bool hasOwner = !newOwner.empty() && oldOwner.empty();
301 
302     std::string stateStr(hasOwner ? "online" : "offline");
303     getLogger().log(fmt::format("Changing sensors for service {} to {}",
304                                 serviceName, stateStr),
305                     Logger::info);
306 
307     auto sensorItr(sensorMap.find(serviceName));
308 
309     if (sensorItr != sensorMap.end())
310     {
311         // set all sensors' owner state to not-owned
312         for (auto& sensor : sensorItr->second)
313         {
314             sensor->setOwner(hasOwner);
315             sensor->getFan().process(*sensor);
316         }
317     }
318 }
319 
320 void System::updateFanHealth(const Fan& fan)
321 {
322     std::vector<bool> sensorStatus;
323     for (const auto& sensor : fan.sensors())
324     {
325         sensorStatus.push_back(sensor->functional());
326     }
327 
328     _fanHealth[fan.getName()] =
329         std::make_tuple(fan.present(), std::move(sensorStatus));
330 }
331 
332 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck)
333 {
334     updateFanHealth(fan);
335 
336     if (_powerState->isPowerOn() && !skipRulesCheck)
337     {
338         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
339                       [this](auto& rule) {
340                           rule->check(PowerRuleState::runtime, _fanHealth);
341                       });
342     }
343 }
344 
345 void System::setFaultConfig([[maybe_unused]] const json& jsonObj)
346 {
347 #ifdef MONITOR_USE_JSON
348     std::shared_ptr<PowerInterfaceBase> powerInterface =
349         std::make_shared<PowerInterface>(_thermalAlert);
350 
351     PowerOffAction::PrePowerOffFunc func =
352         std::bind(std::mem_fn(&System::logShutdownError), this);
353 
354     _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func);
355 
356     _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj);
357 #endif
358 }
359 
360 void System::powerStateChanged(bool powerStateOn)
361 {
362     std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) {
363         fan->powerStateChanged(powerStateOn);
364     });
365 
366     if (powerStateOn)
367     {
368         if (!_loaded)
369         {
370             log<level::ERR>("No conf file found at power on");
371             throw std::runtime_error("No conf file found at power on");
372         }
373 
374         // If no fan has its sensors on D-Bus, then there is a problem
375         // with the fan controller.  Log an error and shut down.
376         if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
377                 return fan->numSensorsOnDBusAtPowerOn() == 0;
378             }))
379         {
380             handleOfflineFanController();
381             return;
382         }
383 
384         if (_sensorMatch.empty())
385         {
386             subscribeSensorsToServices();
387         }
388 
389         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
390                       [this](auto& rule) {
391                           rule->check(PowerRuleState::atPgood, _fanHealth);
392                       });
393         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
394                       [this](auto& rule) {
395                           rule->check(PowerRuleState::runtime, _fanHealth);
396                       });
397     }
398     else
399     {
400         _thermalAlert.enabled(false);
401 
402         // Cancel any in-progress power off actions
403         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
404                       [this](auto& rule) { rule->cancel(); });
405     }
406 }
407 
408 void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor)
409 {
410     std::string fanPath{util::INVENTORY_PATH + fan.getName()};
411 
412     getLogger().log(
413         fmt::format("Creating event log for faulted fan {} sensor {}", fanPath,
414                     sensor.name()),
415         Logger::error);
416 
417     // In order to know if the event log should have a severity of error or
418     // informational, count the number of existing nonfunctional sensors and
419     // compare it to _numNonfuncSensorsBeforeError.
420     size_t nonfuncSensors = 0;
421     for (const auto& fan : _fans)
422     {
423         for (const auto& s : fan->sensors())
424         {
425             // Don't count nonfunctional sensors that still have their
426             // error timer running as nonfunctional since they haven't
427             // had event logs created for those errors yet.
428             if (!s->functional() && !s->errorTimerRunning())
429             {
430                 nonfuncSensors++;
431             }
432         }
433     }
434 
435     Severity severity = Severity::Error;
436     if (nonfuncSensors < _numNonfuncSensorsBeforeError)
437     {
438         severity = Severity::Informational;
439     }
440 
441     auto error =
442         std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault",
443                                    fanPath, sensor.name(), severity);
444 
445     auto sensorData = captureSensorData();
446     error->commit(sensorData);
447 
448     // Save the error so it can be committed again on a power off.
449     _lastError = std::move(error);
450 }
451 
452 void System::fanMissingErrorTimerExpired(const Fan& fan)
453 {
454     std::string fanPath{util::INVENTORY_PATH + fan.getName()};
455 
456     getLogger().log(
457         fmt::format("Creating event log for missing fan {}", fanPath),
458         Logger::error);
459 
460     auto error = std::make_unique<FanError>(
461         "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error);
462 
463     auto sensorData = captureSensorData();
464     error->commit(sensorData);
465 
466     // Save the error so it can be committed again on a power off.
467     _lastError = std::move(error);
468 }
469 
470 void System::logShutdownError()
471 {
472     if (_lastError)
473     {
474         getLogger().log("Re-committing previous fan error before power off");
475 
476         // Still use the latest sensor data
477         auto sensorData = captureSensorData();
478         _lastError->commit(sensorData, true);
479     }
480 }
481 
482 json System::captureSensorData()
483 {
484     json data;
485 
486     for (const auto& fan : _fans)
487     {
488         for (const auto& sensor : fan->sensors())
489         {
490             json values;
491             values["present"] = fan->present();
492             values["functional"] = sensor->functional();
493             values["in_range"] = !fan->outOfRange(*sensor);
494             values["tach"] = sensor->getInput();
495 
496             if (sensor->hasTarget())
497             {
498                 values["target"] = sensor->getTarget();
499             }
500 
501             // convert between string/json to remove newlines
502             values["prev_tachs"] = json(sensor->getPrevTach()).dump();
503 
504             if (sensor->hasTarget())
505             {
506                 values["prev_targets"] = json(sensor->getPrevTarget()).dump();
507             }
508 
509             if (sensor->getMethod() == MethodMode::count)
510             {
511                 values["ticks"] = sensor->getCounter();
512             }
513             data["sensors"][sensor->name()] = values;
514         }
515     }
516 
517     return data;
518 }
519 
520 void System::handleOfflineFanController()
521 {
522     getLogger().log("The fan controller appears to be offline.  Shutting down.",
523                     Logger::error);
524 
525     auto ffdc = collectHwmonFFDC();
526 
527     FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline",
528                    Severity::Critical};
529     error.commit(ffdc, true);
530 
531     PowerInterface::executeHardPowerOff();
532 
533     createBmcDump();
534 }
535 
536 /**
537  * @brief Create a BMC Dump
538  */
539 void System::createBmcDump() const
540 {
541     try
542     {
543         util::SDBusPlus::callMethod(
544             "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc",
545             "xyz.openbmc_project.Dump.Create", "CreateDump",
546             std::vector<
547                 std::pair<std::string, std::variant<std::string, uint64_t>>>());
548     }
549     catch (const std::exception& e)
550     {
551         getLogger().log(
552             fmt::format("Caught exception while creating BMC dump: {}",
553                         e.what()),
554             Logger::error);
555     }
556 }
557 
558 void System::dumpDebugData(sdeventplus::source::Signal&,
559                            const struct signalfd_siginfo*)
560 {
561     json output;
562 
563     if (_loaded)
564     {
565         output["logs"] = getLogger().getLogs();
566         output["sensors"] = captureSensorData();
567     }
568     else
569     {
570         output["error"] = "Fan monitor not loaded yet.  Try again later.";
571     }
572 
573     std::ofstream file{System::dumpFile};
574     if (!file)
575     {
576         log<level::ERR>("Could not open file for fan monitor dump");
577     }
578     else
579     {
580         file << std::setw(4) << output;
581     }
582 }
583 
584 } // namespace phosphor::fan::monitor
585