1 /**
2 * Copyright © 2022 IBM Corporation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "system.hpp"
17
18 #include "dbus_paths.hpp"
19 #include "fan.hpp"
20 #include "fan_defs.hpp"
21 #include "tach_sensor.hpp"
22 #include "trust_manager.hpp"
23 #include "types.hpp"
24 #include "utility.hpp"
25 #ifdef MONITOR_USE_JSON
26 #include "json_config.hpp"
27 #include "json_parser.hpp"
28 #endif
29
30 #include "config.h"
31
32 #include "hwmon_ffdc.hpp"
33
34 #include <nlohmann/json.hpp>
35 #include <phosphor-logging/log.hpp>
36 #include <sdbusplus/bus.hpp>
37 #include <sdbusplus/bus/match.hpp>
38 #include <sdeventplus/event.hpp>
39 #include <sdeventplus/source/signal.hpp>
40
41 namespace phosphor::fan::monitor
42 {
43
44 using json = nlohmann::json;
45 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
46
47 using namespace phosphor::logging;
48
49 const std::string System::dumpFile = "/tmp/fan_monitor_dump.json";
50
System(Mode mode,sdbusplus::bus_t & bus,const sdeventplus::Event & event)51 System::System(Mode mode, sdbusplus::bus_t& bus,
52 const sdeventplus::Event& event) :
53 _mode(mode),
54 _bus(bus), _event(event),
55 #ifdef MONITOR_USE_HOST_STATE
56 _powerState(std::make_unique<HostPowerState>(
57 #else
58 _powerState(std::make_unique<PGoodState>(
59 #endif
60 bus, std::bind(std::mem_fn(&System::powerStateChanged), this,
61 std::placeholders::_1))),
62 _thermalAlert(bus, THERMAL_ALERT_OBJPATH)
63 {}
64
start()65 void System::start()
66 {
67 namespace match = sdbusplus::bus::match;
68
69 // must be done before service detection
70 _inventoryMatch = std::make_unique<sdbusplus::bus::match_t>(
71 _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC),
72 std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1));
73
74 bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>(
75 _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus",
76 "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC);
77
78 if (invServiceRunning)
79 {
80 _inventoryMatch.reset();
81
82 if (!_loaded)
83 {
84 load();
85 }
86 }
87 }
88
load()89 void System::load()
90 {
91 json jsonObj = json::object();
92 #ifdef MONITOR_USE_JSON
93 try
94 {
95 jsonObj = getJsonObj();
96 #endif
97 auto trustGrps = getTrustGroups(jsonObj);
98 auto fanDefs = getFanDefinitions(jsonObj);
99 // Retrieve and set trust groups within the trust manager
100 setTrustMgr(getTrustGroups(jsonObj));
101 // Clear/set configured fan definitions
102 _fans.clear();
103 _fanHealth.clear();
104 // Retrieve fan definitions and create fan objects to be monitored
105 setFans(fanDefs);
106 setFaultConfig(jsonObj);
107 log<level::INFO>("Configuration loaded");
108
109 _loaded = true;
110 #ifdef MONITOR_USE_JSON
111 }
112 catch (const phosphor::fan::NoConfigFound&)
113 {}
114 #endif
115
116 if (_powerState->isPowerOn())
117 {
118 // Fans could be missing on startup, so check the power off rules.
119 // Tach sensors default to functional, so they wouldn't cause a power
120 // off here.
121 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
122 [this](auto& rule) {
123 rule->check(PowerRuleState::runtime, _fanHealth);
124 });
125 }
126
127 subscribeSensorsToServices();
128 }
129
/**
 * @brief Creates one NameOwnerChanged match per D-Bus service that hosts the
 *        monitored fan sensors.
 *
 * Any existing matches are discarded first. The services are looked up with
 * getSubTreeRaw() under FAN_SENSOR_PATH for every interface used by the
 * sensors (the value interface is always included). Sensors that are not
 * found are logged and skipped. A util::DBusError from the lookup is
 * ignored, which leaves no matches registered.
 */
void System::subscribeSensorsToServices()
{
    namespace rules = sdbusplus::bus::match::rules;

    _sensorMatch.clear();

    // Collect every interface in use; the set guards against duplicates and
    // is seeded with the value interface so that one is always present.
    std::set<std::string> interfaceSet{util::FAN_SENSOR_VALUE_INTF};
    for (const auto& fan : _fans)
    {
        for (const auto& sensor : fan->sensors())
        {
            interfaceSet.insert(sensor->getInterface());
        }
    }

    // getSubTreeRaw() is passed a vector rather than a set
    std::vector<std::string> interfaces(interfaceSet.begin(),
                                        interfaceSet.end());

    try
    {
        // Service information for all services hosting these interfaces
        auto serviceObjects = util::SDBusPlus::getSubTreeRaw(
            _bus, FAN_SENSOR_PATH, interfaces, 0);

        SensorMapType sensorMap;

        for (const auto& fan : _fans)
        {
            for (const auto& sensor : fan->sensors())
            {
                const auto found = serviceObjects.find(sensor->name());

                if (found != serviceObjects.end() && !found->second.empty())
                {
                    // Associate each hosting service name with the sensor
                    for (const auto& [serviceName, unused] : found->second)
                    {
                        sensorMap[serviceName].insert(sensor);
                    }
                }
                else
                {
                    getLogger().log(
                        std::format("Fan sensor entry {} not found in D-Bus",
                                    sensor->name()),
                        Logger::error);
                }
            }
        }

        // Only one match is created per service; each callback is bound with
        // its own copy of the service-to-sensors map.
        for (const auto& [serviceName, unused] : sensorMap)
        {
            _sensorMatch.emplace_back(std::make_unique<sdbusplus::bus::match_t>(
                _bus, rules::nameOwnerChanged(serviceName),
                std::bind(&System::tachSignalOffline, this,
                          std::placeholders::_1, sensorMap)));
        }
    }
    catch (const util::DBusError&)
    {
        // getSubTreeRaw() throws when the fan sensor paths don't exist yet
    }
}
200
inventoryOnlineCb(sdbusplus::message_t & msg)201 void System::inventoryOnlineCb(sdbusplus::message_t& msg)
202 {
203 namespace match = sdbusplus::bus::match;
204
205 std::string iface;
206 msg.read(iface);
207
208 if (util::INVENTORY_INTF != iface)
209 {
210 return;
211 }
212
213 std::string oldName;
214 msg.read(oldName);
215
216 std::string newName;
217 msg.read(newName);
218
219 // newName should never be empty since match was reset on the first
220 // nameOwnerChanged signal received from the service.
221 if (!_loaded && !newName.empty())
222 {
223 load();
224 }
225
226 // cancel any further notifications about the service state
227 _inventoryMatch.reset();
228 }
229
sighupHandler(sdeventplus::source::Signal &,const struct signalfd_siginfo *)230 void System::sighupHandler(sdeventplus::source::Signal&,
231 const struct signalfd_siginfo*)
232 {
233 try
234 {
235 load();
236 }
237 catch (std::runtime_error& re)
238 {
239 log<level::ERR>("Error reloading config, no config changes made",
240 entry("LOAD_ERROR=%s", re.what()));
241 }
242 }
243
244 const std::vector<CreateGroupFunction>
getTrustGroups(const json & jsonObj)245 System::getTrustGroups([[maybe_unused]] const json& jsonObj)
246 {
247 #ifdef MONITOR_USE_JSON
248 return getTrustGrps(jsonObj);
249 #else
250 return trustGroups;
251 #endif
252 }
253
setTrustMgr(const std::vector<CreateGroupFunction> & groupFuncs)254 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs)
255 {
256 _trust = std::make_unique<trust::Manager>(groupFuncs);
257 }
258
259 const std::vector<FanDefinition>
getFanDefinitions(const json & jsonObj)260 System::getFanDefinitions([[maybe_unused]] const json& jsonObj)
261 {
262 #ifdef MONITOR_USE_JSON
263 return getFanDefs(jsonObj);
264 #else
265 return fanDefinitions;
266 #endif
267 }
268
setFans(const std::vector<FanDefinition> & fanDefs)269 void System::setFans(const std::vector<FanDefinition>& fanDefs)
270 {
271 for (const auto& fanDef : fanDefs)
272 {
273 // Check if a condition exists on the fan
274 auto condition = fanDef.condition;
275 if (condition)
276 {
277 // Condition exists, skip adding fan if it fails
278 if (!(*condition)(_bus))
279 {
280 continue;
281 }
282 }
283 _fans.emplace_back(
284 std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this));
285
286 updateFanHealth(*(_fans.back()));
287 }
288 }
289
/**
 * @brief NameOwnerChanged callback indicating a sensor service went on or
 *        offline.
 *
 * Determines the new online/offline status from the old and new owners, then
 * applies that status to every sensor mapped to the service and has each
 * sensor's fan process it.
 *
 * @param[in] msg - NameOwnerChanged signal message (name, old owner, new
 *                  owner)
 * @param[in] sensorMap - Map of service names to the sensors they host
 */
void System::tachSignalOffline(sdbusplus::message_t& msg,
                               const SensorMapType& sensorMap)
{
    std::string serviceName;
    std::string oldOwner;
    std::string newOwner;

    msg.read(serviceName);
    msg.read(oldOwner);
    msg.read(newOwner);

    // Online means the name went from having no owner to having one;
    // anything else is treated as offline.
    const bool hasOwner = !newOwner.empty() && oldOwner.empty();

    std::string stateStr(hasOwner ? "online" : "offline");
    getLogger().log(std::format("Changing sensors for service {} to {}",
                                serviceName, stateStr),
                    Logger::info);

    const auto entry = sensorMap.find(serviceName);
    if (entry == sensorMap.end())
    {
        return;
    }

    // Apply the new owner state to all of the service's sensors
    for (auto& sensor : entry->second)
    {
        sensor->setOwner(hasOwner);
        sensor->getFan().process(*sensor);
    }
}
323
updateFanHealth(const Fan & fan)324 void System::updateFanHealth(const Fan& fan)
325 {
326 std::vector<bool> sensorStatus;
327 for (const auto& sensor : fan.sensors())
328 {
329 sensorStatus.push_back(sensor->functional());
330 }
331
332 _fanHealth[fan.getName()] = std::make_tuple(fan.present(),
333 std::move(sensorStatus));
334 }
335
fanStatusChange(const Fan & fan,bool skipRulesCheck)336 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck)
337 {
338 updateFanHealth(fan);
339
340 if (_powerState->isPowerOn() && !skipRulesCheck)
341 {
342 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
343 [this](auto& rule) {
344 rule->check(PowerRuleState::runtime, _fanHealth);
345 });
346 }
347 }
348
setFaultConfig(const json & jsonObj)349 void System::setFaultConfig([[maybe_unused]] const json& jsonObj)
350 {
351 #ifdef MONITOR_USE_JSON
352 std::shared_ptr<PowerInterfaceBase> powerInterface =
353 std::make_shared<PowerInterface>(_thermalAlert);
354
355 PowerOffAction::PrePowerOffFunc func =
356 std::bind(std::mem_fn(&System::logShutdownError), this);
357
358 _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func);
359
360 _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj);
361 #endif
362 }
363
powerStateChanged(bool powerStateOn)364 void System::powerStateChanged(bool powerStateOn)
365 {
366 std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) {
367 fan->powerStateChanged(powerStateOn);
368 });
369
370 if (powerStateOn)
371 {
372 if (!_loaded)
373 {
374 log<level::ERR>("No conf file found at power on");
375 throw std::runtime_error("No conf file found at power on");
376 }
377
378 // If no fan has its sensors on D-Bus, then there is a problem
379 // with the fan controller. Log an error and shut down.
380 if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
381 return fan->numSensorsOnDBusAtPowerOn() == 0;
382 }))
383 {
384 #if DELAY_HOST_CONTROL > 0
385 sleep(DELAY_HOST_CONTROL);
386 std::for_each(_fans.begin(), _fans.end(),
387 [powerStateOn](auto& fan) {
388 fan->powerStateChanged(powerStateOn);
389 });
390 if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
391 return fan->numSensorsOnDBusAtPowerOn() == 0;
392 }))
393 {
394 handleOfflineFanController();
395 return;
396 }
397 #else
398 handleOfflineFanController();
399 return;
400 #endif
401 }
402
403 if (_sensorMatch.empty())
404 {
405 subscribeSensorsToServices();
406 }
407
408 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
409 [this](auto& rule) {
410 rule->check(PowerRuleState::atPgood, _fanHealth);
411 });
412 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
413 [this](auto& rule) {
414 rule->check(PowerRuleState::runtime, _fanHealth);
415 });
416 }
417 else
418 {
419 _thermalAlert.enabled(false);
420
421 // Cancel any in-progress power off actions
422 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
423 [this](auto& rule) { rule->cancel(); });
424 }
425 }
426
/**
 * @brief Creates an event log for a faulted fan sensor.
 *
 * The log's severity is Informational when the number of nonfunctional
 * sensors whose error timers are not running is below
 * _numNonfuncSensorsBeforeError, and Error otherwise. The error is committed
 * with the current sensor data and saved so it can be committed again on a
 * power off.
 *
 * @param[in] fan - Fan that owns the faulted sensor
 * @param[in] sensor - Sensor whose error timer expired
 */
void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor)
{
    std::string fanPath{util::INVENTORY_PATH + fan.getName()};

    getLogger().log(
        std::format("Creating event log for faulted fan {} sensor {}", fanPath,
                    sensor.name()),
        Logger::error);

    // In order to know if the event log should have a severity of error or
    // informational, count the number of existing nonfunctional sensors and
    // compare it to _numNonfuncSensorsBeforeError.
    size_t nonfuncSensors = 0;
    // The loop variable is deliberately not named 'fan' so that it does not
    // shadow the 'fan' parameter.
    for (const auto& monitoredFan : _fans)
    {
        for (const auto& s : monitoredFan->sensors())
        {
            // Don't count nonfunctional sensors that still have their
            // error timer running as nonfunctional since they haven't
            // had event logs created for those errors yet.
            if (!s->functional() && !s->errorTimerRunning())
            {
                ++nonfuncSensors;
            }
        }
    }

    Severity severity = Severity::Error;
    if (nonfuncSensors < _numNonfuncSensorsBeforeError)
    {
        severity = Severity::Informational;
    }

    auto error =
        std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault",
                                   fanPath, sensor.name(), severity);

    auto sensorData = captureSensorData();
    error->commit(sensorData);

    // Save the error so it can be committed again on a power off.
    _lastError = std::move(error);
}
470
/**
 * @brief Creates an Error severity event log for a missing fan.
 *
 * The error is committed with the current sensor data and saved so it can be
 * committed again on a power off.
 *
 * @param[in] fan - Fan that is missing
 */
void System::fanMissingErrorTimerExpired(const Fan& fan)
{
    std::string inventoryPath{util::INVENTORY_PATH + fan.getName()};

    getLogger().log(
        std::format("Creating event log for missing fan {}", inventoryPath),
        Logger::error);

    // No sensor name applies to a missing fan, so an empty one is passed
    auto missingError = std::make_unique<FanError>(
        "xyz.openbmc_project.Fan.Error.Missing", inventoryPath, "",
        Severity::Error);

    auto sensorData = captureSensorData();
    missingError->commit(sensorData);

    // Keep the error around so logShutdownError() can commit it again
    _lastError = std::move(missingError);
}
488
logShutdownError()489 void System::logShutdownError()
490 {
491 if (_lastError)
492 {
493 getLogger().log("Re-committing previous fan error before power off");
494
495 // Still use the latest sensor data
496 auto sensorData = captureSensorData();
497 _lastError->commit(sensorData, true);
498 }
499 }
500
captureSensorData()501 json System::captureSensorData()
502 {
503 json data;
504
505 for (const auto& fan : _fans)
506 {
507 for (const auto& sensor : fan->sensors())
508 {
509 json values;
510 values["present"] = fan->present();
511 values["functional"] = sensor->functional();
512 values["in_range"] = !fan->outOfRange(*sensor);
513 values["tach"] = sensor->getInput();
514
515 if (sensor->hasTarget())
516 {
517 values["target"] = sensor->getTarget();
518 }
519
520 // convert between string/json to remove newlines
521 values["prev_tachs"] = json(sensor->getPrevTach()).dump();
522
523 if (sensor->hasTarget())
524 {
525 values["prev_targets"] = json(sensor->getPrevTarget()).dump();
526 }
527
528 if (sensor->getMethod() == MethodMode::count)
529 {
530 values["ticks"] = sensor->getCounter();
531 }
532 data["sensors"][sensor->name()] = values;
533 }
534 }
535
536 return data;
537 }
538
handleOfflineFanController()539 void System::handleOfflineFanController()
540 {
541 getLogger().log("The fan controller appears to be offline. Shutting down.",
542 Logger::error);
543
544 auto ffdc = collectHwmonFFDC();
545
546 FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline",
547 Severity::Critical};
548 error.commit(ffdc, true);
549
550 PowerInterface::executeHardPowerOff();
551
552 createBmcDump();
553 }
554
/**
 * @brief Create a BMC Dump
 *
 * Calls the CreateDump method of the dump manager with an empty parameter
 * list. Any std::exception thrown by the call is logged and not propagated.
 */
void System::createBmcDump() const
{
    // Element type of the (empty) parameter list passed to CreateDump
    using DumpParam =
        std::pair<std::string, std::variant<std::string, uint64_t>>;

    try
    {
        util::SDBusPlus::callMethod(
            "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc",
            "xyz.openbmc_project.Dump.Create", "CreateDump",
            std::vector<DumpParam>());
    }
    catch (const std::exception& e)
    {
        getLogger().log(
            std::format("Caught exception while creating BMC dump: {}",
                        e.what()),
            Logger::error);
    }
}
576
dumpDebugData(sdeventplus::source::Signal &,const struct signalfd_siginfo *)577 void System::dumpDebugData(sdeventplus::source::Signal&,
578 const struct signalfd_siginfo*)
579 {
580 json output;
581
582 if (_loaded)
583 {
584 output["logs"] = getLogger().getLogs();
585 output["sensors"] = captureSensorData();
586 }
587 else
588 {
589 output["error"] = "Fan monitor not loaded yet. Try again later.";
590 }
591
592 std::ofstream file{System::dumpFile};
593 if (!file)
594 {
595 log<level::ERR>("Could not open file for fan monitor dump");
596 }
597 else
598 {
599 file << std::setw(4) << output;
600 }
601 }
602
603 } // namespace phosphor::fan::monitor
604