1 /**
2 * Copyright © 2022 IBM Corporation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "system.hpp"
17
18 #include "dbus_paths.hpp"
19 #include "fan.hpp"
20 #include "fan_defs.hpp"
21 #include "tach_sensor.hpp"
22 #include "trust_manager.hpp"
23 #include "types.hpp"
24 #include "utility.hpp"
25 #ifdef MONITOR_USE_JSON
26 #include "json_config.hpp"
27 #include "json_parser.hpp"
28 #endif
29
30 #include "config.h"
31
32 #include "hwmon_ffdc.hpp"
33
34 #include <nlohmann/json.hpp>
35 #include <phosphor-logging/log.hpp>
36 #include <sdbusplus/bus.hpp>
37 #include <sdbusplus/bus/match.hpp>
38 #include <sdeventplus/event.hpp>
39 #include <sdeventplus/source/signal.hpp>
40
41 namespace phosphor::fan::monitor
42 {
43
44 using json = nlohmann::json;
45 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;
46
47 using namespace phosphor::logging;
48
49 const std::string System::dumpFile = "/tmp/fan_monitor_dump.json";
50
System(Mode mode,sdbusplus::bus_t & bus,const sdeventplus::Event & event)51 System::System(Mode mode, sdbusplus::bus_t& bus,
52 const sdeventplus::Event& event) :
53 _mode(mode), _bus(bus), _event(event),
54 #ifdef MONITOR_USE_HOST_STATE
55 _powerState(std::make_unique<HostPowerState>(
56 #else
57 _powerState(std::make_unique<PGoodState>(
58 #endif
59 bus, std::bind(std::mem_fn(&System::powerStateChanged), this,
60 std::placeholders::_1))),
61 _thermalAlert(bus, THERMAL_ALERT_OBJPATH)
62 {}
63
start()64 void System::start()
65 {
66 namespace match = sdbusplus::bus::match;
67
68 // must be done before service detection
69 _inventoryMatch = std::make_unique<sdbusplus::bus::match_t>(
70 _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC),
71 std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1));
72
73 bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>(
74 _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus",
75 "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC);
76
77 if (invServiceRunning)
78 {
79 _inventoryMatch.reset();
80
81 if (!_loaded)
82 {
83 load();
84 }
85 }
86 }
87
load()88 void System::load()
89 {
90 json jsonObj = json::object();
91 #ifdef MONITOR_USE_JSON
92 try
93 {
94 jsonObj = getJsonObj();
95 #endif
96 auto trustGrps = getTrustGroups(jsonObj);
97 auto fanDefs = getFanDefinitions(jsonObj);
98 // Retrieve and set trust groups within the trust manager
99 setTrustMgr(getTrustGroups(jsonObj));
100 // Clear/set configured fan definitions
101 _fans.clear();
102 _fanHealth.clear();
103 // Retrieve fan definitions and create fan objects to be monitored
104 setFans(fanDefs);
105 setFaultConfig(jsonObj);
106 log<level::INFO>("Configuration loaded");
107
108 _loaded = true;
109 #ifdef MONITOR_USE_JSON
110 }
111 catch (const phosphor::fan::NoConfigFound&)
112 {}
113 #endif
114
115 if (_powerState->isPowerOn())
116 {
117 // Fans could be missing on startup, so check the power off rules.
118 // Tach sensors default to functional, so they wouldn't cause a power
119 // off here.
120 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
121 [this](auto& rule) {
122 rule->check(PowerRuleState::runtime, _fanHealth);
123 });
124 }
125
126 subscribeSensorsToServices();
127 }
128
/**
 * @brief Watches the D-Bus services that host the fan sensors.
 *
 * Drops all existing sensor service matches, looks up which services host
 * each monitored sensor, and creates one NameOwnerChanged match per service.
 * Each match calls tachSignalOffline() with its own copy of the
 * service-to-sensors map. Sensors that cannot be found on D-Bus are logged
 * and skipped. A DBusError from the subtree lookup is swallowed, leaving no
 * matches in place.
 */
void System::subscribeSensorsToServices()
{
    namespace rules = sdbusplus::bus::match::rules;

    _sensorMatch.clear();

    // Collect every interface used by the sensors. The value interface is
    // always included, and the set takes care of duplicates.
    std::set<std::string> ifaceSet{util::FAN_SENSOR_VALUE_INTF};
    for (const auto& fan : _fans)
    {
        for (const auto& sensor : fan->sensors())
        {
            ifaceSet.insert(sensor->getInterface());
        }
    }

    // getSubTreeRaw wants a vector
    std::vector<std::string> interfaces(ifaceSet.begin(), ifaceSet.end());

    SensorMapType sensorMap;

    try
    {
        // Find out which services host objects with these interfaces
        auto serviceObjects = util::SDBusPlus::getSubTreeRaw(
            _bus, FAN_SENSOR_PATH, interfaces, 0);

        for (const auto& fan : _fans)
        {
            for (const auto& sensor : fan->sensors())
            {
                const auto found = serviceObjects.find(sensor->name());
                const bool hosted = (found != serviceObjects.end()) &&
                                    !found->second.empty();

                if (!hosted)
                {
                    getLogger().log(
                        std::format("Fan sensor entry {} not found in D-Bus",
                                    sensor->name()),
                        Logger::error);
                    continue;
                }

                // Remember the sensor under every service that hosts it
                for (const auto& [serviceName, unused] : found->second)
                {
                    sensorMap[serviceName].insert(sensor);
                }
            }
        }

        // One match per service, no matter how many sensors it hosts
        for (const auto& [serviceName, unused] : sensorMap)
        {
            _sensorMatch.emplace_back(std::make_unique<sdbusplus::bus::match_t>(
                _bus, rules::nameOwnerChanged(serviceName),
                [this, sensorMap](sdbusplus::message_t& msg) {
                    tachSignalOffline(msg, sensorMap);
                }));
        }
    }
    catch (const util::DBusError&)
    {
        // getSubTreeRaw() throws when the fan sensor paths don't exist yet
    }
}
199
inventoryOnlineCb(sdbusplus::message_t & msg)200 void System::inventoryOnlineCb(sdbusplus::message_t& msg)
201 {
202 namespace match = sdbusplus::bus::match;
203
204 std::string iface;
205 msg.read(iface);
206
207 if (util::INVENTORY_INTF != iface)
208 {
209 return;
210 }
211
212 std::string oldName;
213 msg.read(oldName);
214
215 std::string newName;
216 msg.read(newName);
217
218 // newName should never be empty since match was reset on the first
219 // nameOwnerChanged signal received from the service.
220 if (!_loaded && !newName.empty())
221 {
222 load();
223 }
224
225 // cancel any further notifications about the service state
226 _inventoryMatch.reset();
227 }
228
sighupHandler(sdeventplus::source::Signal &,const struct signalfd_siginfo *)229 void System::sighupHandler(sdeventplus::source::Signal&,
230 const struct signalfd_siginfo*)
231 {
232 try
233 {
234 load();
235 }
236 catch (std::runtime_error& re)
237 {
238 log<level::ERR>("Error reloading config, no config changes made",
239 entry("LOAD_ERROR=%s", re.what()));
240 }
241 }
242
243 const std::vector<CreateGroupFunction>
getTrustGroups(const json & jsonObj)244 System::getTrustGroups([[maybe_unused]] const json& jsonObj)
245 {
246 #ifdef MONITOR_USE_JSON
247 return getTrustGrps(jsonObj);
248 #else
249 return trustGroups;
250 #endif
251 }
252
setTrustMgr(const std::vector<CreateGroupFunction> & groupFuncs)253 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs)
254 {
255 _trust = std::make_unique<trust::Manager>(groupFuncs);
256 }
257
258 const std::vector<FanDefinition>
getFanDefinitions(const json & jsonObj)259 System::getFanDefinitions([[maybe_unused]] const json& jsonObj)
260 {
261 #ifdef MONITOR_USE_JSON
262 return getFanDefs(jsonObj);
263 #else
264 return fanDefinitions;
265 #endif
266 }
267
setFans(const std::vector<FanDefinition> & fanDefs)268 void System::setFans(const std::vector<FanDefinition>& fanDefs)
269 {
270 for (const auto& fanDef : fanDefs)
271 {
272 // Check if a condition exists on the fan
273 auto condition = fanDef.condition;
274 if (condition)
275 {
276 // Condition exists, skip adding fan if it fails
277 if (!(*condition)(_bus))
278 {
279 continue;
280 }
281 }
282 _fans.emplace_back(
283 std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this));
284
285 updateFanHealth(*(_fans.back()));
286 }
287 }
288
/**
 * @brief Callback for a NameOwnerChanged signal of a sensor hosting service.
 *
 * Works out whether the service came online (it had no owner before and has
 * one now) and applies that state to every sensor mapped to the service,
 * asking each sensor's fan to process the sensor afterwards. Any other owner
 * transition is treated as offline.
 *
 * @param[in] msg - NameOwnerChanged signal (name, old owner, new owner)
 * @param[in] sensorMap - Map of service name to the sensors it hosts
 */
void System::tachSignalOffline(sdbusplus::message_t& msg,
                               const SensorMapType& sensorMap)
{
    std::string serviceName;
    std::string oldOwner;
    std::string newOwner;

    msg.read(serviceName, oldOwner, newOwner);

    // online only when an owner appeared where there was none before
    const bool hasOwner = oldOwner.empty() && !newOwner.empty();

    std::string stateStr;
    if (hasOwner)
    {
        stateStr = "online";
    }
    else
    {
        stateStr = "offline";
    }

    getLogger().log(std::format("Changing sensors for service {} to {}",
                                serviceName, stateStr),
                    Logger::info);

    const auto entry = sensorMap.find(serviceName);
    if (entry == sensorMap.end())
    {
        // No sensors are tracked for this service
        return;
    }

    // Apply the new owner state to all sensors of the service
    for (auto& sensor : entry->second)
    {
        sensor->setOwner(hasOwner);
        sensor->getFan().process(*sensor);
    }
}
322
updateFanHealth(const Fan & fan)323 void System::updateFanHealth(const Fan& fan)
324 {
325 std::vector<bool> sensorStatus;
326 for (const auto& sensor : fan.sensors())
327 {
328 sensorStatus.push_back(sensor->functional());
329 }
330
331 _fanHealth[fan.getName()] =
332 std::make_tuple(fan.present(), std::move(sensorStatus));
333 }
334
fanStatusChange(const Fan & fan,bool skipRulesCheck)335 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck)
336 {
337 updateFanHealth(fan);
338
339 if (_powerState->isPowerOn() && !skipRulesCheck)
340 {
341 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
342 [this](auto& rule) {
343 rule->check(PowerRuleState::runtime, _fanHealth);
344 });
345 }
346 }
347
setFaultConfig(const json & jsonObj)348 void System::setFaultConfig([[maybe_unused]] const json& jsonObj)
349 {
350 #ifdef MONITOR_USE_JSON
351 std::shared_ptr<PowerInterfaceBase> powerInterface =
352 std::make_shared<PowerInterface>(_thermalAlert);
353
354 PowerOffAction::PrePowerOffFunc func =
355 std::bind(std::mem_fn(&System::logShutdownError), this);
356
357 _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func);
358
359 _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj);
360 #endif
361 }
362
powerStateChanged(bool powerStateOn)363 void System::powerStateChanged(bool powerStateOn)
364 {
365 std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) {
366 fan->powerStateChanged(powerStateOn);
367 });
368
369 if (powerStateOn)
370 {
371 if (!_loaded)
372 {
373 log<level::ERR>("No conf file found at power on");
374 throw std::runtime_error("No conf file found at power on");
375 }
376
377 // If no fan has its sensors on D-Bus, then there is a problem
378 // with the fan controller. Log an error and shut down.
379 if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
380 return fan->numSensorsOnDBusAtPowerOn() == 0;
381 }))
382 {
383 #if DELAY_HOST_CONTROL > 0
384 sleep(DELAY_HOST_CONTROL);
385 std::for_each(_fans.begin(), _fans.end(),
386 [powerStateOn](auto& fan) {
387 fan->powerStateChanged(powerStateOn);
388 });
389 if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
390 return fan->numSensorsOnDBusAtPowerOn() == 0;
391 }))
392 {
393 handleOfflineFanController();
394 return;
395 }
396 #else
397 handleOfflineFanController();
398 return;
399 #endif
400 }
401
402 if (_sensorMatch.empty())
403 {
404 subscribeSensorsToServices();
405 }
406
407 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
408 [this](auto& rule) {
409 rule->check(PowerRuleState::atPgood, _fanHealth);
410 });
411 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
412 [this](auto& rule) {
413 rule->check(PowerRuleState::runtime, _fanHealth);
414 });
415 }
416 else
417 {
418 _thermalAlert.enabled(false);
419
420 // Cancel any in-progress power off actions
421 std::for_each(_powerOffRules.begin(), _powerOffRules.end(),
422 [this](auto& rule) { rule->cancel(); });
423 }
424 }
425
/**
 * @brief Creates an event log for a faulted fan sensor.
 *
 * The log is informational while the number of nonfunctional sensors whose
 * error timer is no longer running is below _numNonfuncSensorsBeforeError,
 * and an error otherwise. Current sensor data is attached to the log, and
 * the log is kept in _lastError.
 *
 * @param[in] fan - Fan that owns the faulted sensor
 * @param[in] sensor - The faulted sensor
 */
void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor)
{
    auto fanPath = util::INVENTORY_PATH + fan.getName();

    getLogger().log(
        std::format("Creating event log for faulted fan {} sensor {}", fanPath,
                    sensor.name()),
        Logger::error);

    // The severity depends on how many sensors are already nonfunctional.
    // Sensors whose error timer is still running don't count, since no
    // event log has been created for them yet.
    size_t nonfuncSensors = 0;
    for (const auto& monitoredFan : _fans)
    {
        const auto& fanSensors = monitoredFan->sensors();
        nonfuncSensors += static_cast<size_t>(std::count_if(
            fanSensors.begin(), fanSensors.end(), [](const auto& s) {
                return !s->functional() && !s->errorTimerRunning();
            }));
    }

    const Severity severity =
        (nonfuncSensors < _numNonfuncSensorsBeforeError)
            ? Severity::Informational
            : Severity::Error;

    auto error =
        std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault",
                                   fanPath, sensor.name(), severity);

    auto sensorData = captureSensorData();
    error->commit(sensorData);

    // Keep the error around so it can be committed again on a power off.
    _lastError = std::move(error);
}
469
/**
 * @brief Creates an event log for a missing fan.
 *
 * The log has error severity and an empty sensor name, gets the current
 * sensor data attached, and is kept in _lastError.
 *
 * @param[in] fan - The missing fan
 */
void System::fanMissingErrorTimerExpired(const Fan& fan)
{
    auto fanPath = util::INVENTORY_PATH + fan.getName();

    getLogger().log(
        std::format("Creating event log for missing fan {}", fanPath),
        Logger::error);

    auto missingError = std::make_unique<FanError>(
        "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error);

    auto sensorData = captureSensorData();
    missingError->commit(sensorData);

    // Keep the error around so it can be committed again on a power off.
    _lastError = std::move(missingError);
}
487
logShutdownError()488 void System::logShutdownError()
489 {
490 if (_lastError)
491 {
492 getLogger().log("Re-committing previous fan error before power off");
493
494 // Still use the latest sensor data
495 auto sensorData = captureSensorData();
496 _lastError->commit(sensorData, true);
497 }
498 }
499
captureSensorData()500 json System::captureSensorData()
501 {
502 json data;
503
504 for (const auto& fan : _fans)
505 {
506 for (const auto& sensor : fan->sensors())
507 {
508 json values;
509 values["present"] = fan->present();
510 values["functional"] = sensor->functional();
511 values["in_range"] = !fan->outOfRange(*sensor);
512 values["tach"] = sensor->getInput();
513
514 if (sensor->hasTarget())
515 {
516 values["target"] = sensor->getTarget();
517 }
518
519 // convert between string/json to remove newlines
520 values["prev_tachs"] = json(sensor->getPrevTach()).dump();
521
522 if (sensor->hasTarget())
523 {
524 values["prev_targets"] = json(sensor->getPrevTarget()).dump();
525 }
526
527 if (sensor->getMethod() == MethodMode::count)
528 {
529 values["ticks"] = sensor->getCounter();
530 }
531 data["sensors"][sensor->name()] = values;
532 }
533 }
534
535 return data;
536 }
537
handleOfflineFanController()538 void System::handleOfflineFanController()
539 {
540 getLogger().log("The fan controller appears to be offline. Shutting down.",
541 Logger::error);
542
543 auto ffdc = collectHwmonFFDC();
544
545 FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline",
546 Severity::Critical};
547 error.commit(ffdc, true);
548
549 PowerInterface::executeHardPowerOff();
550
551 createBmcDump();
552 }
553
/**
 * @brief Create a BMC Dump
 *
 * Calls CreateDump on the dump manager with an empty parameter list. A
 * failure is logged and otherwise ignored.
 */
void System::createBmcDump() const
{
    // Type of the (empty) additional parameter list of CreateDump
    using DumpParams =
        std::vector<std::pair<std::string, std::variant<std::string, uint64_t>>>;

    try
    {
        util::SDBusPlus::callMethod(
            "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc",
            "xyz.openbmc_project.Dump.Create", "CreateDump", DumpParams{});
    }
    catch (const std::exception& e)
    {
        getLogger().log(
            std::format("Caught exception while creating BMC dump: {}",
                        e.what()),
            Logger::error);
    }
}
575
dumpDebugData(sdeventplus::source::Signal &,const struct signalfd_siginfo *)576 void System::dumpDebugData(sdeventplus::source::Signal&,
577 const struct signalfd_siginfo*)
578 {
579 json output;
580
581 if (_loaded)
582 {
583 output["logs"] = getLogger().getLogs();
584 output["sensors"] = captureSensorData();
585 }
586 else
587 {
588 output["error"] = "Fan monitor not loaded yet. Try again later.";
589 }
590
591 std::ofstream file{System::dumpFile};
592 if (!file)
593 {
594 log<level::ERR>("Could not open file for fan monitor dump");
595 }
596 else
597 {
598 file << std::setw(4) << output;
599 }
600 }
601
602 } // namespace phosphor::fan::monitor
603