/**
 * Copyright © 2022 IBM Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "system.hpp"

#include "dbus_paths.hpp"
#include "fan.hpp"
#include "fan_defs.hpp"
#include "tach_sensor.hpp"
#include "trust_manager.hpp"
#include "types.hpp"
#include "utility.hpp"
#ifdef MONITOR_USE_JSON
#include "json_config.hpp"
#include "json_parser.hpp"
#endif

#include "config.h"

#include "hwmon_ffdc.hpp"

#include <unistd.h>

#include <nlohmann/json.hpp>
#include <phosphor-logging/log.hpp>
#include <sdbusplus/bus.hpp>
#include <sdbusplus/bus/match.hpp>
#include <sdeventplus/event.hpp>
#include <sdeventplus/source/signal.hpp>

#include <algorithm>
#include <cstdint>
#include <format>
#include <fstream>
#include <functional>
#include <iomanip>
#include <memory>
#include <set>
#include <stdexcept>
#include <string>
#include <tuple>
#include <utility>
#include <variant>
#include <vector>

namespace phosphor::fan::monitor
{

using json = nlohmann::json;
using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level;

using namespace phosphor::logging;

const std::string System::dumpFile = "/tmp/fan_monitor_dump.json";

/**
 * @brief Constructor
 *
 * Stores the mode, bus and event loop, and creates the power state object
 * (host-state or pgood based, chosen at compile time) with
 * powerStateChanged() registered as its callback.
 *
 * @param[in] mode - mode of fan monitor
 * @param[in] bus - sdbusplus bus object
 * @param[in] event - event loop reference
 */
System::System(Mode mode, sdbusplus::bus_t& bus,
               const sdeventplus::Event& event) :
    _mode(mode), _bus(bus), _event(event),
#ifdef MONITOR_USE_HOST_STATE
    _powerState(std::make_unique<HostPowerState>(
#else
    _powerState(std::make_unique<PGoodState>(
#endif
        bus, std::bind(std::mem_fn(&System::powerStateChanged), this,
                       std::placeholders::_1))),
    _thermalAlert(bus, THERMAL_ALERT_OBJPATH)
{}

/**
 * @brief Starts monitoring.
 *
 * Registers for the inventory service's NameOwnerChanged signal and then
 * asks D-Bus whether that service already has an owner.  If it does, the
 * match is dropped and the configuration is loaded right away (unless it
 * was already loaded); otherwise loading is deferred to
 * inventoryOnlineCb().
 */
void System::start()
{
    namespace match = sdbusplus::bus::match;

    // must be done before service detection
    _inventoryMatch = std::make_unique<sdbusplus::bus::match_t>(
        _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC),
        std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1));

    const bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>(
        _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus",
        "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC);

    if (!invServiceRunning)
    {
        // Wait for the NameOwnerChanged callback.
        return;
    }

    // The service is already there, the signal is no longer needed.
    _inventoryMatch.reset();

    if (!_loaded)
    {
        load();
    }
}

/**
 * @brief Loads (or reloads) the fan monitor configuration.
 *
 * Reads the trust groups and fan definitions, rebuilds the trust manager,
 * the fan objects and the fault configuration, then checks the runtime
 * power off rules if power is on and (re)subscribes to the sensor services.
 *
 * In the JSON build, a NoConfigFound exception from the config lookup is
 * swallowed and _loaded is left unchanged.
 */
void System::load()
{
    json jsonObj = json::object();
#ifdef MONITOR_USE_JSON
    try
    {
        jsonObj = getJsonObj();
#endif
        // Read both parts of the configuration up front so that a failure
        // while reading them happens before any existing state is replaced.
        auto trustGrps = getTrustGroups(jsonObj);
        auto fanDefs = getFanDefinitions(jsonObj);
        // Set the retrieved trust groups within the trust manager
        setTrustMgr(trustGrps);
        // Clear/set configured fan definitions
        _fans.clear();
        _fanHealth.clear();
        // Create fan objects to be monitored from the fan definitions
        setFans(fanDefs);
        setFaultConfig(jsonObj);
        log<level::INFO>("Configuration loaded");

        _loaded = true;
#ifdef MONITOR_USE_JSON
    }
    catch (const phosphor::fan::NoConfigFound&)
    {}
#endif

    if (_powerState->isPowerOn())
    {
        // Fans could be missing on startup, so check the power off rules.
        // Tach sensors default to functional, so they wouldn't cause a power
        // off here.
        for (auto& rule : _powerOffRules)
        {
            rule->check(PowerRuleState::runtime, _fanHealth);
        }
    }

    subscribeSensorsToServices();
}

/**
 * @brief Creates one NameOwnerChanged match per D-Bus service that hosts
 *        any of the monitored tach sensors.
 *
 * Existing matches are discarded first.  A DBusError raised by the subtree
 * lookup is ignored, leaving _sensorMatch empty.
 */
void System::subscribeSensorsToServices()
{
    namespace match = sdbusplus::bus::match;

    _sensorMatch.clear();

    SensorMapType sensorMap;

    // build a list of all interfaces, always including the value interface
    // using set automatically guards against duplicates
    std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF};

    for (const auto& fan : _fans)
    {
        for (const auto& sensor : fan->sensors())
        {
            unique_interfaces.insert(sensor->getInterface());
        }
    }
    // convert them to vector to pass into getSubTreeRaw
    std::vector<std::string> interfaces(unique_interfaces.begin(),
                                        unique_interfaces.end());

    try
    {
        // get service information for all service names that are
        // hosting these interfaces
        auto serviceObjects = util::SDBusPlus::getSubTreeRaw(
            _bus, FAN_SENSOR_PATH, interfaces, 0);

        for (const auto& fan : _fans)
        {
            // For every sensor in each fan
            for (const auto& sensor : fan->sensors())
            {
                const auto itServ = serviceObjects.find(sensor->name());

                if (serviceObjects.end() == itServ || itServ->second.empty())
                {
                    getLogger().log(
                        std::format("Fan sensor entry {} not found in D-Bus",
                                    sensor->name()),
                        Logger::error);
                    continue;
                }

                for (const auto& [serviceName, unused] : itServ->second)
                {
                    // associate service name with sensor
                    sensorMap[serviceName].insert(sensor);
                }
            }
        }

        // only create 1 match per service
        for (const auto& [serviceName, unused] : sensorMap)
        {
            // map its service name to the sensor
            _sensorMatch.emplace_back(std::make_unique<sdbusplus::bus::match_t>(
                _bus, match::rules::nameOwnerChanged(serviceName),
                std::bind(&System::tachSignalOffline, this,
                          std::placeholders::_1, sensorMap)));
        }
    }
    catch (const util::DBusError&)
    {
        // catch exception from getSubTreeRaw() when fan sensor paths don't
        // exist yet
    }
}

/**
 * @brief Callback for the inventory service NameOwnerChanged signal.
 *
 * Signals whose first argument is not util::INVENTORY_INTF are ignored.
 * Otherwise the configuration is loaded if that has not happened yet and
 * the new owner is not empty, and the match is then dropped.
 *
 * @param[in] msg - the NameOwnerChanged message (name, old owner, new owner)
 */
void System::inventoryOnlineCb(sdbusplus::message_t& msg)
{
    std::string iface;
    msg.read(iface);

    if (util::INVENTORY_INTF != iface)
    {
        return;
    }

    std::string oldName;
    msg.read(oldName);

    std::string newName;
    msg.read(newName);

    // newName should never be empty since match was reset on the first
    // nameOwnerChanged signal received from the service.
    if (!_loaded && !newName.empty())
    {
        load();
    }

    // cancel any further notifications about the service state
    _inventoryMatch.reset();
}

/**
 * @brief SIGHUP handler that reloads the configuration.
 *
 * A std::runtime_error thrown by load() is logged and not propagated.
 */
void System::sighupHandler(sdeventplus::source::Signal&,
                           const struct signalfd_siginfo*)
{
    try
    {
        load();
    }
    catch (const std::runtime_error& re)
    {
        log<level::ERR>("Error reloading config, no config changes made",
                        entry("LOAD_ERROR=%s", re.what()));
    }
}

/**
 * @brief Returns the trust group creation functions.
 *
 * @param[in] jsonObj - JSON configuration; only used in the JSON build,
 *                      otherwise the compiled-in trustGroups are returned
 *
 * @return the functions used to create the trust groups
 */
const std::vector<CreateGroupFunction>
    System::getTrustGroups([[maybe_unused]] const json& jsonObj)
{
#ifdef MONITOR_USE_JSON
    return getTrustGrps(jsonObj);
#else
    return trustGroups;
#endif
}

/**
 * @brief Replaces the trust manager with one built from the given group
 *        creation functions.
 *
 * @param[in] groupFuncs - functions used to create the trust groups
 */
void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs)
{
    _trust = std::make_unique<trust::Manager>(groupFuncs);
}

/**
 * @brief Returns the fan definitions.
 *
 * @param[in] jsonObj - JSON configuration; only used in the JSON build,
 *                      otherwise the compiled-in fanDefinitions are returned
 *
 * @return the definitions of the fans to monitor
 */
const std::vector<FanDefinition>
    System::getFanDefinitions([[maybe_unused]] const json& jsonObj)
{
#ifdef MONITOR_USE_JSON
    return getFanDefs(jsonObj);
#else
    return fanDefinitions;
#endif
}

/**
 * @brief Creates a Fan object for every definition whose condition (if it
 *        has one) passes, and records the initial health of each.
 *
 * Fans are appended to _fans; the caller clears it beforehand if needed.
 *
 * @param[in] fanDefs - the fan definitions to create fans from
 */
void System::setFans(const std::vector<FanDefinition>& fanDefs)
{
    for (const auto& fanDef : fanDefs)
    {
        // Check if a condition exists on the fan
        auto condition = fanDef.condition;
        if (condition && !(*condition)(_bus))
        {
            // Condition exists and failed, skip adding this fan
            continue;
        }

        _fans.emplace_back(
            std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this));

        updateFanHealth(*(_fans.back()));
    }
}

/**
 * @brief Callback indicating a sensor service went [on|off]line.
 *
 * Determines the on/offline status and sets all sensors hosted by that
 * service to the new state.
 *
 * @param[in] msg - the NameOwnerChanged message (name, old owner, new owner)
 * @param[in] sensorMap - map of service name to the sensors it hosts
 */
void System::tachSignalOffline(sdbusplus::message_t& msg,
                               const SensorMapType& sensorMap)
{
    std::string serviceName;
    std::string oldOwner;
    std::string newOwner;

    msg.read(serviceName);
    msg.read(oldOwner);
    msg.read(newOwner);

    // true if sensor server came back online, false -> went offline
    const bool hasOwner = !newOwner.empty() && oldOwner.empty();

    const std::string stateStr(hasOwner ? "online" : "offline");
    getLogger().log(std::format("Changing sensors for service {} to {}",
                                serviceName, stateStr),
                    Logger::info);

    const auto sensorItr = sensorMap.find(serviceName);
    if (sensorItr == sensorMap.end())
    {
        return;
    }

    // propagate the new owner state to every sensor on this service and
    // let the owning fan re-evaluate it
    for (auto& sensor : sensorItr->second)
    {
        sensor->setOwner(hasOwner);
        sensor->getFan().process(*sensor);
    }
}

/**
 * @brief Records the fan's presence and the functional state of each of
 *        its sensors in _fanHealth, keyed by fan name.
 *
 * @param[in] fan - the fan to record
 */
void System::updateFanHealth(const Fan& fan)
{
    std::vector<bool> sensorStatus;
    for (const auto& sensor : fan.sensors())
    {
        sensorStatus.push_back(sensor->functional());
    }

    _fanHealth[fan.getName()] =
        std::make_tuple(fan.present(), std::move(sensorStatus));
}

/**
 * @brief Called when a fan's presence or sensor status changed.
 *
 * Updates the fan health map and, when power is on and the check is not
 * being skipped, runs the runtime power off rules.
 *
 * @param[in] fan - the fan that changed
 * @param[in] skipRulesCheck - if the power off rule check should be skipped
 */
void System::fanStatusChange(const Fan& fan, bool skipRulesCheck)
{
    updateFanHealth(fan);

    if (_powerState->isPowerOn() && !skipRulesCheck)
    {
        for (auto& rule : _powerOffRules)
        {
            rule->check(PowerRuleState::runtime, _fanHealth);
        }
    }
}

/**
 * @brief Reads the fault handling configuration: the power off rules and
 *        the number of nonfunctional sensors allowed before an error.
 *
 * Does nothing outside of the JSON build.
 *
 * @param[in] jsonObj - JSON configuration
 */
void System::setFaultConfig([[maybe_unused]] const json& jsonObj)
{
#ifdef MONITOR_USE_JSON
    std::shared_ptr<PowerInterfaceBase> powerInterface =
        std::make_shared<PowerInterface>(_thermalAlert);

    PowerOffAction::PrePowerOffFunc func =
        std::bind(std::mem_fn(&System::logShutdownError), this);

    _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func);

    _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj);
#endif
}

/**
 * @brief Callback for power state changes.
 *
 * Forwards the new state to every fan.  On power on it also verifies that a
 * configuration was loaded and that at least one fan has sensors on D-Bus,
 * then runs the at-pgood and runtime power off rules.  On power off it
 * disables the thermal alert and cancels the power off rules.
 *
 * @param[in] powerStateOn - true if power is now on
 *
 * @throws std::runtime_error if power came on and no configuration was
 *         loaded
 */
void System::powerStateChanged(bool powerStateOn)
{
    const auto notifyFans = [this, powerStateOn]() {
        for (auto& fan : _fans)
        {
            fan->powerStateChanged(powerStateOn);
        }
    };

    const auto noFanHasSensorsOnDBus = [this]() {
        return std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) {
            return fan->numSensorsOnDBusAtPowerOn() == 0;
        });
    };

    notifyFans();

    if (powerStateOn)
    {
        if (!_loaded)
        {
            log<level::ERR>("No conf file found at power on");
            throw std::runtime_error("No conf file found at power on");
        }

        // If no fan has its sensors on D-Bus, then there is a problem
        // with the fan controller.  Log an error and shut down.
        if (noFanHasSensorsOnDBus())
        {
#if DELAY_HOST_CONTROL > 0
            // Wait, notify the fans again and re-check before giving up.
            sleep(DELAY_HOST_CONTROL);
            notifyFans();
            if (noFanHasSensorsOnDBus())
            {
                handleOfflineFanController();
                return;
            }
#else
            handleOfflineFanController();
            return;
#endif
        }

        if (_sensorMatch.empty())
        {
            subscribeSensorsToServices();
        }

        for (auto& rule : _powerOffRules)
        {
            rule->check(PowerRuleState::atPgood, _fanHealth);
        }
        for (auto& rule : _powerOffRules)
        {
            rule->check(PowerRuleState::runtime, _fanHealth);
        }
    }
    else
    {
        _thermalAlert.enabled(false);

        // Cancel any in-progress power off actions
        for (auto& rule : _powerOffRules)
        {
            rule->cancel();
        }
    }
}

/**
 * @brief Creates an event log for a faulted fan sensor.
 *
 * The log is informational if fewer than _numNonfuncSensorsBeforeError
 * sensors are currently counted as nonfunctional, otherwise it is an error.
 * The error is kept in _lastError.
 *
 * @param[in] fan - the fan that owns the faulted sensor
 * @param[in] sensor - the faulted sensor
 */
void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor)
{
    std::string fanPath{util::INVENTORY_PATH + fan.getName()};

    getLogger().log(
        std::format("Creating event log for faulted fan {} sensor {}", fanPath,
                    sensor.name()),
        Logger::error);

    // In order to know if the event log should have a severity of error or
    // informational, count the number of existing nonfunctional sensors and
    // compare it to _numNonfuncSensorsBeforeError.
    size_t nonfuncSensors = 0;
    for (const auto& monitoredFan : _fans)
    {
        for (const auto& s : monitoredFan->sensors())
        {
            // Don't count nonfunctional sensors that still have their
            // error timer running as nonfunctional since they haven't
            // had event logs created for those errors yet.
            if (!s->functional() && !s->errorTimerRunning())
            {
                nonfuncSensors++;
            }
        }
    }

    const Severity severity =
        (nonfuncSensors < _numNonfuncSensorsBeforeError)
            ? Severity::Informational
            : Severity::Error;

    auto error =
        std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault",
                                   fanPath, sensor.name(), severity);

    auto sensorData = captureSensorData();
    error->commit(sensorData);

    // Save the error so it can be committed again on a power off.
    _lastError = std::move(error);
}

/**
 * @brief Creates an error-severity event log for a missing fan and keeps
 *        it in _lastError.
 *
 * @param[in] fan - the missing fan
 */
void System::fanMissingErrorTimerExpired(const Fan& fan)
{
    std::string fanPath{util::INVENTORY_PATH + fan.getName()};

    getLogger().log(
        std::format("Creating event log for missing fan {}", fanPath),
        Logger::error);

    auto error = std::make_unique<FanError>(
        "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error);

    auto sensorData = captureSensorData();
    error->commit(sensorData);

    // Save the error so it can be committed again on a power off.
    _lastError = std::move(error);
}

/**
 * @brief Commits the saved fan error again, with fresh sensor data, if
 *        there is one.  Bound as the pre-power-off function of the power
 *        off rules.
 */
void System::logShutdownError()
{
    if (!_lastError)
    {
        return;
    }

    getLogger().log("Re-committing previous fan error before power off");

    // Still use the latest sensor data
    auto sensorData = captureSensorData();
    _lastError->commit(sensorData, true);
}

/**
 * @brief Collects the current state of every sensor of every fan.
 *
 * @return a JSON object with a "sensors" member holding one entry per
 *         sensor name
 */
json System::captureSensorData()
{
    json data;

    for (const auto& fan : _fans)
    {
        for (const auto& sensor : fan->sensors())
        {
            const bool hasTarget = sensor->hasTarget();

            json values;
            values["present"] = fan->present();
            values["functional"] = sensor->functional();
            values["in_range"] = !fan->outOfRange(*sensor);
            values["tach"] = sensor->getInput();

            if (hasTarget)
            {
                values["target"] = sensor->getTarget();
            }

            // convert between string/json to remove newlines
            values["prev_tachs"] = json(sensor->getPrevTach()).dump();

            if (hasTarget)
            {
                values["prev_targets"] = json(sensor->getPrevTarget()).dump();
            }

            if (sensor->getMethod() == MethodMode::count)
            {
                values["ticks"] = sensor->getCounter();
            }
            data["sensors"][sensor->name()] = values;
        }
    }

    return data;
}

/**
 * @brief Handles the case where no fan sensors are on D-Bus at power on.
 *
 * Commits a critical FanControllerOffline error with the hwmon FFDC,
 * executes a hard power off and then requests a BMC dump.
 */
void System::handleOfflineFanController()
{
    getLogger().log("The fan controller appears to be offline.  Shutting down.",
                    Logger::error);

    auto ffdc = collectHwmonFFDC();

    FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline",
                   Severity::Critical};
    error.commit(ffdc, true);

    PowerInterface::executeHardPowerOff();

    createBmcDump();
}

/**
 * @brief Create a BMC Dump
 *
 * Any exception from the D-Bus call is logged and not propagated.
 */
void System::createBmcDump() const
{
    try
    {
        util::SDBusPlus::callMethod(
            "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc",
            "xyz.openbmc_project.Dump.Create", "CreateDump",
            std::vector<
                std::pair<std::string, std::variant<std::string, uint64_t>>>());
    }
    catch (const std::exception& e)
    {
        getLogger().log(
            std::format("Caught exception while creating BMC dump: {}",
                        e.what()),
            Logger::error);
    }
}

/**
 * @brief Signal handler that writes debug data to System::dumpFile.
 *
 * Writes the logger contents and the captured sensor data once the
 * configuration has been loaded, or an error message otherwise.
 */
void System::dumpDebugData(sdeventplus::source::Signal&,
                           const struct signalfd_siginfo*)
{
    json output;

    if (_loaded)
    {
        output["logs"] = getLogger().getLogs();
        output["sensors"] = captureSensorData();
    }
    else
    {
        output["error"] = "Fan monitor not loaded yet.  Try again later.";
    }

    std::ofstream file{System::dumpFile};
    if (!file)
    {
        log<level::ERR>("Could not open file for fan monitor dump");
        return;
    }

    file << std::setw(4) << output;
}

} // namespace phosphor::fan::monitor