xref: /openbmc/phosphor-fan-presence/monitor/system.cpp (revision b2e9a4fcc2253bcb585e92b4642ed4b0036941df)
1 /**
2  * Copyright © 2022 IBM Corporation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "system.hpp"
17 
18 #include "fan.hpp"
19 #include "fan_defs.hpp"
20 #include "tach_sensor.hpp"
21 #include "trust_manager.hpp"
22 #include "types.hpp"
23 #include "utility.hpp"
24 #ifdef MONITOR_USE_JSON
25 #include "json_config.hpp"
26 #include "json_parser.hpp"
27 #endif
28 
29 #include "config.h"
30 
31 #include "hwmon_ffdc.hpp"
32 
33 #include <nlohmann/json.hpp>
34 #include <phosphor-logging/log.hpp>
35 #include <sdbusplus/bus.hpp>
36 #include <sdeventplus/event.hpp>
37 #include <sdeventplus/source/signal.hpp>
38 
39 namespace phosphor::fan::monitor
40 {
41 
42 using json = nlohmann::json;
43 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
44 
45 using namespace phosphor::logging;
46 
47 System::System(Mode mode, sdbusplus::bus::bus& bus,
48                const sdeventplus::Event& event) :
49     _mode(mode),
50     _bus(bus), _event(event),
51     _powerState(std::make_unique<PGoodState>(
52         bus, std::bind(std::mem_fn(&System::powerStateChanged), this,
53                        std::placeholders::_1))),
54     _thermalAlert(bus, THERMAL_ALERT_OBJPATH)
55 {}
56 
57 void System::start()
58 {
59     namespace match = sdbusplus::bus::match;
60 
61     // must be done before service detection
62     _inventoryMatch = std::make_unique<match::match>(
63         _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC),
64         std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1));
65 
66     bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>(
67         _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus",
68         "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC);
69 
70     if (invServiceRunning)
71     {
72         _inventoryMatch.reset();
73 
74         if (!_loaded)
75         {
76             load();
77         }
78     }
79 }
80 
81 void System::load()
82 {
83     json jsonObj = json::object();
84 #ifdef MONITOR_USE_JSON
85     try
86     {
87         jsonObj = getJsonObj();
88 #endif
89         auto trustGrps = getTrustGroups(jsonObj);
90         auto fanDefs = getFanDefinitions(jsonObj);
91         // Retrieve and set trust groups within the trust manager
92         setTrustMgr(getTrustGroups(jsonObj));
93         // Clear/set configured fan definitions
94         _fans.clear();
95         _fanHealth.clear();
96         // Retrieve fan definitions and create fan objects to be monitored
97         setFans(fanDefs);
98         setFaultConfig(jsonObj);
99         log<level::INFO>("Configuration loaded");
100 
101         _loaded = true;
102 #ifdef MONITOR_USE_JSON
103     }
104     catch (const phosphor::fan::NoConfigFound&)
105     {}
106 #endif
107 
108     if (_powerState->isPowerOn())
109     {
110         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
111                       [this](auto& rule) {
112                           rule->check(PowerRuleState::runtime, _fanHealth);
113                       });
114     }
115 
116     subscribeSensorsToServices();
117 }
118 
119 void System::subscribeSensorsToServices()
120 {
121     namespace match = sdbusplus::bus::match;
122 
123     _sensorMatch.clear();
124 
125     SensorMapType sensorMap;
126 
127     // build a list of all interfaces, always including the value interface
128     // using set automatically guards against duplicates
129     std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF};
130 
131     for (const auto& fan : _fans)
132     {
133         for (const auto& sensor : fan->sensors())
134         {
135             unique_interfaces.insert(sensor->getInterface());
136         }
137     }
138     // convert them to vector to pass into getSubTreeRaw
139     std::vector<std::string> interfaces(unique_interfaces.begin(),
140                                         unique_interfaces.end());
141 
142     try
143     {
144         // get service information for all service names that are
145         // hosting these interfaces
146         auto serviceObjects = util::SDBusPlus::getSubTreeRaw(
147             _bus, FAN_SENSOR_PATH, interfaces, 0);
148 
149         for (const auto& fan : _fans)
150         {
151             // For every sensor in each fan
152             for (const auto& sensor : fan->sensors())
153             {
154                 const auto itServ = serviceObjects.find(sensor->name());
155 
156                 if (serviceObjects.end() == itServ || itServ->second.empty())
157                 {
158                     getLogger().log(
159                         fmt::format("Fan sensor entry {} not found in D-Bus",
160                                     sensor->name()),
161                         Logger::error);
162                     continue;
163                 }
164 
165                 for (const auto& [serviceName, unused] : itServ->second)
166                 {
167                     // associate service name with sensor
168                     sensorMap[serviceName].insert(sensor);
169                 }
170             }
171         }
172 
173         // only create 1 match per service
174         for (const auto& [serviceName, unused] : sensorMap)
175         {
176             // map its service name to the sensor
177             _sensorMatch.emplace_back(std::make_unique<match::match>(
178                 _bus, match::rules::nameOwnerChanged(serviceName),
179                 std::bind(&System::tachSignalOffline, this,
180                           std::placeholders::_1, sensorMap)));
181         }
182     }
183     catch (const util::DBusError&)
184     {
185         // catch exception from getSubTreeRaw() when fan sensor paths don't
186         // exist yet
187     }
188 }
189 
190 void System::inventoryOnlineCb(sdbusplus::message::message& msg)
191 {
192     namespace match = sdbusplus::bus::match;
193 
194     std::string iface;
195     msg.read(iface);
196 
197     if (util::INVENTORY_INTF != iface)
198     {
199         return;
200     }
201 
202     std::string oldName;
203     msg.read(oldName);
204 
205     std::string newName;
206     msg.read(newName);
207 
208     // newName should never be empty since match was reset on the first
209     // nameOwnerChanged signal received from the service.
210     if (!_loaded && !newName.empty())
211     {
212         load();
213     }
214 
215     // cancel any further notifications about the service state
216     _inventoryMatch.reset();
217 }
218 
219 void System::sighupHandler(sdeventplus::source::Signal&,
220                            const struct signalfd_siginfo*)
221 {
222     try
223     {
224         load();
225     }
226     catch (std::runtime_error& re)
227     {
228         log<level::ERR>("Error reloading config, no config changes made",
229                         entry("LOAD_ERROR=%s", re.what()));
230     }
231 }
232 
233 const std::vector<CreateGroupFunction>
234     System::getTrustGroups([[maybe_unused]] const json& jsonObj)
235 {
236 #ifdef MONITOR_USE_JSON
237     return getTrustGrps(jsonObj);
238 #else
239     return trustGroups;
240 #endif
241 }
242 
243 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs)
244 {
245     _trust = std::make_unique<trust::Manager>(groupFuncs);
246 }
247 
248 const std::vector<FanDefinition>
249     System::getFanDefinitions([[maybe_unused]] const json& jsonObj)
250 {
251 #ifdef MONITOR_USE_JSON
252     return getFanDefs(jsonObj);
253 #else
254     return fanDefinitions;
255 #endif
256 }
257 
258 void System::setFans(const std::vector<FanDefinition>& fanDefs)
259 {
260     for (const auto& fanDef : fanDefs)
261     {
262         // Check if a condition exists on the fan
263         auto condition = std::get<conditionField>(fanDef);
264         if (condition)
265         {
266             // Condition exists, skip adding fan if it fails
267             if (!(*condition)(_bus))
268             {
269                 continue;
270             }
271         }
272         _fans.emplace_back(
273             std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this));
274 
275         updateFanHealth(*(_fans.back()));
276     }
277 }
278 
279 // callback indicating a service went [on|off]line.
280 // Determine on/offline status, set all sensors for that service
281 // to new state
282 //
283 void System::tachSignalOffline(sdbusplus::message::message& msg,
284                                SensorMapType const& sensorMap)
285 {
286     std::string serviceName, oldOwner, newOwner;
287 
288     msg.read(serviceName);
289     msg.read(oldOwner);
290     msg.read(newOwner);
291 
292     // true if sensor server came back online, false -> went offline
293     bool hasOwner = !newOwner.empty() && oldOwner.empty();
294 
295     std::string stateStr(hasOwner ? "online" : "offline");
296     getLogger().log(fmt::format("Changing sensors for service {} to {}",
297                                 serviceName, stateStr),
298                     Logger::info);
299 
300     auto sensorItr(sensorMap.find(serviceName));
301 
302     if (sensorItr != sensorMap.end())
303     {
304         // set all sensors' owner state to not-owned
305         for (auto& sensor : sensorItr->second)
306         {
307             sensor->setOwner(hasOwner);
308             sensor->getFan().process(*sensor);
309         }
310     }
311 }
312 
313 void System::updateFanHealth(const Fan& fan)
314 {
315     std::vector<bool> sensorStatus;
316     for (const auto& sensor : fan.sensors())
317     {
318         sensorStatus.push_back(sensor->functional());
319     }
320 
321     _fanHealth[fan.getName()] =
322         std::make_tuple(fan.present(), std::move(sensorStatus));
323 }
324 
325 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck)
326 {
327     updateFanHealth(fan);
328 
329     if (_powerState->isPowerOn() && !skipRulesCheck)
330     {
331         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
332                       [this](auto& rule) {
333                           rule->check(PowerRuleState::runtime, _fanHealth);
334                       });
335     }
336 }
337 
338 void System::setFaultConfig([[maybe_unused]] const json& jsonObj)
339 {
340 #ifdef MONITOR_USE_JSON
341     std::shared_ptr<PowerInterfaceBase> powerInterface =
342         std::make_shared<PowerInterface>(_thermalAlert);
343 
344     PowerOffAction::PrePowerOffFunc func =
345         std::bind(std::mem_fn(&System::logShutdownError), this);
346 
347     _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func);
348 
349     _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj);
350 #endif
351 }
352 
353 void System::powerStateChanged(bool powerStateOn)
354 {
355     std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) {
356         fan->powerStateChanged(powerStateOn);
357     });
358 
359     if (powerStateOn)
360     {
361         if (!_loaded)
362         {
363             log<level::ERR>("No conf file found at power on");
364             throw std::runtime_error("No conf file found at power on");
365         }
366 
367         // If no fan has its sensors on D-Bus, then there is a problem
368         // with the fan controller.  Log an error and shut down.
369         if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
370                 return fan->numSensorsOnDBusAtPowerOn() == 0;
371             }))
372         {
373             handleOfflineFanController();
374             return;
375         }
376 
377         if (_sensorMatch.empty())
378         {
379             subscribeSensorsToServices();
380         }
381 
382         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
383                       [this](auto& rule) {
384                           rule->check(PowerRuleState::atPgood, _fanHealth);
385                       });
386         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
387                       [this](auto& rule) {
388                           rule->check(PowerRuleState::runtime, _fanHealth);
389                       });
390     }
391     else
392     {
393         _thermalAlert.enabled(false);
394 
395         // Cancel any in-progress power off actions
396         std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
397                       [this](auto& rule) { rule->cancel(); });
398     }
399 }
400 
401 void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor)
402 {
403     std::string fanPath{util::INVENTORY_PATH + fan.getName()};
404 
405     getLogger().log(
406         fmt::format("Creating event log for faulted fan {} sensor {}", fanPath,
407                     sensor.name()),
408         Logger::error);
409 
410     // In order to know if the event log should have a severity of error or
411     // informational, count the number of existing nonfunctional sensors and
412     // compare it to _numNonfuncSensorsBeforeError.
413     size_t nonfuncSensors = 0;
414     for (const auto& fan : _fans)
415     {
416         for (const auto& s : fan->sensors())
417         {
418             // Don't count nonfunctional sensors that still have their
419             // error timer running as nonfunctional since they haven't
420             // had event logs created for those errors yet.
421             if (!s->functional() && !s->errorTimerRunning())
422             {
423                 nonfuncSensors++;
424             }
425         }
426     }
427 
428     Severity severity = Severity::Error;
429     if (nonfuncSensors < _numNonfuncSensorsBeforeError)
430     {
431         severity = Severity::Informational;
432     }
433 
434     auto error =
435         std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault",
436                                    fanPath, sensor.name(), severity);
437 
438     auto sensorData = captureSensorData();
439     error->commit(sensorData);
440 
441     // Save the error so it can be committed again on a power off.
442     _lastError = std::move(error);
443 }
444 
445 void System::fanMissingErrorTimerExpired(const Fan& fan)
446 {
447     std::string fanPath{util::INVENTORY_PATH + fan.getName()};
448 
449     getLogger().log(
450         fmt::format("Creating event log for missing fan {}", fanPath),
451         Logger::error);
452 
453     auto error = std::make_unique<FanError>(
454         "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error);
455 
456     auto sensorData = captureSensorData();
457     error->commit(sensorData);
458 
459     // Save the error so it can be committed again on a power off.
460     _lastError = std::move(error);
461 }
462 
463 void System::logShutdownError()
464 {
465     if (_lastError)
466     {
467         getLogger().log("Re-committing previous fan error before power off");
468 
469         // Still use the latest sensor data
470         auto sensorData = captureSensorData();
471         _lastError->commit(sensorData, true);
472     }
473 }
474 
475 json System::captureSensorData()
476 {
477     json data;
478 
479     for (const auto& fan : _fans)
480     {
481         for (const auto& sensor : fan->sensors())
482         {
483             json values;
484             values["present"] = fan->present();
485             values["functional"] = sensor->functional();
486             values["tach"] = sensor->getInput();
487 
488             if (sensor->hasTarget())
489             {
490                 values["target"] = sensor->getTarget();
491             }
492 
493             // convert between string/json to remove newlines
494             values["prev_tachs"] = json(sensor->getPrevTach()).dump();
495 
496             if (sensor->hasTarget())
497             {
498                 values["prev_targets"] = json(sensor->getPrevTarget()).dump();
499             }
500 
501             data["sensors"][sensor->name()] = values;
502         }
503     }
504 
505     return data;
506 }
507 
508 void System::handleOfflineFanController()
509 {
510     getLogger().log("The fan controller appears to be offline.  Shutting down.",
511                     Logger::error);
512 
513     auto ffdc = collectHwmonFFDC();
514 
515     FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline",
516                    Severity::Critical};
517     error.commit(ffdc, true);
518 
519     PowerInterface::executeHardPowerOff();
520 
521     createBmcDump();
522 }
523 
524 /**
525  * @brief Create a BMC Dump
526  */
527 void System::createBmcDump() const
528 {
529     try
530     {
531         util::SDBusPlus::callMethod(
532             "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc",
533             "xyz.openbmc_project.Dump.Create", "CreateDump",
534             std::vector<
535                 std::pair<std::string, std::variant<std::string, uint64_t>>>());
536     }
537     catch (const sdbusplus::exception::exception&)
538     {}
539 }
540 
541 } // namespace phosphor::fan::monitor
542