1 /** 2 * Copyright © 2022 IBM Corporation 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include "system.hpp" 17 18 #include "dbus_paths.hpp" 19 #include "fan.hpp" 20 #include "fan_defs.hpp" 21 #include "tach_sensor.hpp" 22 #include "trust_manager.hpp" 23 #include "types.hpp" 24 #include "utility.hpp" 25 #ifdef MONITOR_USE_JSON 26 #include "json_config.hpp" 27 #include "json_parser.hpp" 28 #endif 29 30 #include "config.h" 31 32 #include "hwmon_ffdc.hpp" 33 34 #include <nlohmann/json.hpp> 35 #include <phosphor-logging/log.hpp> 36 #include <sdbusplus/bus.hpp> 37 #include <sdbusplus/bus/match.hpp> 38 #include <sdeventplus/event.hpp> 39 #include <sdeventplus/source/signal.hpp> 40 41 namespace phosphor::fan::monitor 42 { 43 44 using json = nlohmann::json; 45 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level; 46 47 using namespace phosphor::logging; 48 49 const std::string System::dumpFile = "/tmp/fan_monitor_dump.json"; 50 51 System::System(Mode mode, sdbusplus::bus_t& bus, 52 const sdeventplus::Event& event) : 53 _mode(mode), 54 _bus(bus), _event(event), 55 #ifdef MONITOR_USE_HOST_STATE 56 _powerState(std::make_unique<HostPowerState>( 57 #else 58 _powerState(std::make_unique<PGoodState>( 59 #endif 60 bus, std::bind(std::mem_fn(&System::powerStateChanged), this, 61 std::placeholders::_1))), 62 _thermalAlert(bus, THERMAL_ALERT_OBJPATH) 63 {} 64 65 void System::start() 66 { 67 namespace match = sdbusplus::bus::match; 68 69 // must be done before service detection 70 _inventoryMatch = std::make_unique<sdbusplus::bus::match_t>( 71 _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC), 72 std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1)); 73 74 bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>( 75 _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus", 76 "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC); 77 78 if (invServiceRunning) 79 { 80 _inventoryMatch.reset(); 81 82 if (!_loaded) 83 { 84 load(); 85 } 86 } 87 } 88 89 void System::load() 90 { 91 json jsonObj = json::object(); 92 #ifdef MONITOR_USE_JSON 93 try 94 { 95 jsonObj = getJsonObj(); 96 #endif 97 auto trustGrps = getTrustGroups(jsonObj); 98 auto fanDefs = getFanDefinitions(jsonObj); 99 // Retrieve and set trust groups within the trust manager 100 setTrustMgr(getTrustGroups(jsonObj)); 101 // Clear/set configured fan definitions 102 _fans.clear(); 103 _fanHealth.clear(); 104 // Retrieve fan definitions and create fan objects to be monitored 105 setFans(fanDefs); 106 setFaultConfig(jsonObj); 107 log<level::INFO>("Configuration loaded"); 108 109 _loaded = true; 110 #ifdef MONITOR_USE_JSON 111 } 112 catch (const phosphor::fan::NoConfigFound&) 113 {} 114 #endif 115 116 if (_powerState->isPowerOn()) 117 { 118 // Fans could be missing on startup, so check the power off rules. 119 // Tach sensors default to functional, so they wouldn't cause a power 120 // off here. 121 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 122 [this](auto& rule) { 123 rule->check(PowerRuleState::runtime, _fanHealth); 124 }); 125 } 126 127 subscribeSensorsToServices(); 128 } 129 130 void System::subscribeSensorsToServices() 131 { 132 namespace match = sdbusplus::bus::match; 133 134 _sensorMatch.clear(); 135 136 SensorMapType sensorMap; 137 138 // build a list of all interfaces, always including the value interface 139 // using set automatically guards against duplicates 140 std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF}; 141 142 for (const auto& fan : _fans) 143 { 144 for (const auto& sensor : fan->sensors()) 145 { 146 unique_interfaces.insert(sensor->getInterface()); 147 } 148 } 149 // convert them to vector to pass into getSubTreeRaw 150 std::vector<std::string> interfaces(unique_interfaces.begin(), 151 unique_interfaces.end()); 152 153 try 154 { 155 // get service information for all service names that are 156 // hosting these interfaces 157 auto serviceObjects = util::SDBusPlus::getSubTreeRaw( 158 _bus, FAN_SENSOR_PATH, interfaces, 0); 159 160 for (const auto& fan : _fans) 161 { 162 // For every sensor in each fan 163 for (const auto& sensor : fan->sensors()) 164 { 165 const auto itServ = serviceObjects.find(sensor->name()); 166 167 if (serviceObjects.end() == itServ || itServ->second.empty()) 168 { 169 getLogger().log( 170 fmt::format("Fan sensor entry {} not found in D-Bus", 171 sensor->name()), 172 Logger::error); 173 continue; 174 } 175 176 for (const auto& [serviceName, unused] : itServ->second) 177 { 178 // associate service name with sensor 179 sensorMap[serviceName].insert(sensor); 180 } 181 } 182 } 183 184 // only create 1 match per service 185 for (const auto& [serviceName, unused] : sensorMap) 186 { 187 // map its service name to the sensor 188 _sensorMatch.emplace_back(std::make_unique<sdbusplus::bus::match_t>( 189 _bus, match::rules::nameOwnerChanged(serviceName), 190 std::bind(&System::tachSignalOffline, this, 191 std::placeholders::_1, sensorMap))); 192 } 193 } 194 catch (const util::DBusError&) 195 { 196 // catch exception from getSubTreeRaw() when fan sensor paths don't 197 // exist yet 198 } 199 } 200 201 void System::inventoryOnlineCb(sdbusplus::message_t& msg) 202 { 203 namespace match = sdbusplus::bus::match; 204 205 std::string iface; 206 msg.read(iface); 207 208 if (util::INVENTORY_INTF != iface) 209 { 210 return; 211 } 212 213 std::string oldName; 214 msg.read(oldName); 215 216 std::string newName; 217 msg.read(newName); 218 219 // newName should never be empty since match was reset on the first 220 // nameOwnerChanged signal received from the service. 221 if (!_loaded && !newName.empty()) 222 { 223 load(); 224 } 225 226 // cancel any further notifications about the service state 227 _inventoryMatch.reset(); 228 } 229 230 void System::sighupHandler(sdeventplus::source::Signal&, 231 const struct signalfd_siginfo*) 232 { 233 try 234 { 235 load(); 236 } 237 catch (std::runtime_error& re) 238 { 239 log<level::ERR>("Error reloading config, no config changes made", 240 entry("LOAD_ERROR=%s", re.what())); 241 } 242 } 243 244 const std::vector<CreateGroupFunction> 245 System::getTrustGroups([[maybe_unused]] const json& jsonObj) 246 { 247 #ifdef MONITOR_USE_JSON 248 return getTrustGrps(jsonObj); 249 #else 250 return trustGroups; 251 #endif 252 } 253 254 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs) 255 { 256 _trust = std::make_unique<trust::Manager>(groupFuncs); 257 } 258 259 const std::vector<FanDefinition> 260 System::getFanDefinitions([[maybe_unused]] const json& jsonObj) 261 { 262 #ifdef MONITOR_USE_JSON 263 return getFanDefs(jsonObj); 264 #else 265 return fanDefinitions; 266 #endif 267 } 268 269 void System::setFans(const std::vector<FanDefinition>& fanDefs) 270 { 271 for (const auto& fanDef : fanDefs) 272 { 273 // Check if a condition exists on the fan 274 auto condition = fanDef.condition; 275 if (condition) 276 { 277 // Condition exists, skip adding fan if it fails 278 if (!(*condition)(_bus)) 279 { 280 continue; 281 } 282 } 283 _fans.emplace_back( 284 std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this)); 285 286 updateFanHealth(*(_fans.back())); 287 } 288 } 289 290 // callback indicating a service went [on|off]line. 291 // Determine on/offline status, set all sensors for that service 292 // to new state 293 // 294 void System::tachSignalOffline(sdbusplus::message_t& msg, 295 const SensorMapType& sensorMap) 296 { 297 std::string serviceName, oldOwner, newOwner; 298 299 msg.read(serviceName); 300 msg.read(oldOwner); 301 msg.read(newOwner); 302 303 // true if sensor server came back online, false -> went offline 304 bool hasOwner = !newOwner.empty() && oldOwner.empty(); 305 306 std::string stateStr(hasOwner ? "online" : "offline"); 307 getLogger().log(fmt::format("Changing sensors for service {} to {}", 308 serviceName, stateStr), 309 Logger::info); 310 311 auto sensorItr(sensorMap.find(serviceName)); 312 313 if (sensorItr != sensorMap.end()) 314 { 315 // set all sensors' owner state to not-owned 316 for (auto& sensor : sensorItr->second) 317 { 318 sensor->setOwner(hasOwner); 319 sensor->getFan().process(*sensor); 320 } 321 } 322 } 323 324 void System::updateFanHealth(const Fan& fan) 325 { 326 std::vector<bool> sensorStatus; 327 for (const auto& sensor : fan.sensors()) 328 { 329 sensorStatus.push_back(sensor->functional()); 330 } 331 332 _fanHealth[fan.getName()] = std::make_tuple(fan.present(), 333 std::move(sensorStatus)); 334 } 335 336 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck) 337 { 338 updateFanHealth(fan); 339 340 if (_powerState->isPowerOn() && !skipRulesCheck) 341 { 342 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 343 [this](auto& rule) { 344 rule->check(PowerRuleState::runtime, _fanHealth); 345 }); 346 } 347 } 348 349 void System::setFaultConfig([[maybe_unused]] const json& jsonObj) 350 { 351 #ifdef MONITOR_USE_JSON 352 std::shared_ptr<PowerInterfaceBase> powerInterface = 353 std::make_shared<PowerInterface>(_thermalAlert); 354 355 PowerOffAction::PrePowerOffFunc func = 356 std::bind(std::mem_fn(&System::logShutdownError), this); 357 358 _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func); 359 360 _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj); 361 #endif 362 } 363 364 void System::powerStateChanged(bool powerStateOn) 365 { 366 std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) { 367 fan->powerStateChanged(powerStateOn); 368 }); 369 370 if (powerStateOn) 371 { 372 if (!_loaded) 373 { 374 log<level::ERR>("No conf file found at power on"); 375 throw std::runtime_error("No conf file found at power on"); 376 } 377 378 // If no fan has its sensors on D-Bus, then there is a problem 379 // with the fan controller. Log an error and shut down. 380 if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) { 381 return fan->numSensorsOnDBusAtPowerOn() == 0; 382 })) 383 { 384 #if DELAY_HOST_CONTROL > 0 385 sleep(DELAY_HOST_CONTROL); 386 std::for_each(_fans.begin(), _fans.end(), 387 [powerStateOn](auto& fan) { 388 fan->powerStateChanged(powerStateOn); 389 }); 390 if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) { 391 return fan->numSensorsOnDBusAtPowerOn() == 0; 392 })) 393 { 394 handleOfflineFanController(); 395 return; 396 } 397 #else 398 handleOfflineFanController(); 399 return; 400 #endif 401 } 402 403 if (_sensorMatch.empty()) 404 { 405 subscribeSensorsToServices(); 406 } 407 408 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 409 [this](auto& rule) { 410 rule->check(PowerRuleState::atPgood, _fanHealth); 411 }); 412 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 413 [this](auto& rule) { 414 rule->check(PowerRuleState::runtime, _fanHealth); 415 }); 416 } 417 else 418 { 419 _thermalAlert.enabled(false); 420 421 // Cancel any in-progress power off actions 422 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 423 [this](auto& rule) { rule->cancel(); }); 424 } 425 } 426 427 void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor) 428 { 429 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 430 431 getLogger().log( 432 fmt::format("Creating event log for faulted fan {} sensor {}", fanPath, 433 sensor.name()), 434 Logger::error); 435 436 // In order to know if the event log should have a severity of error or 437 // informational, count the number of existing nonfunctional sensors and 438 // compare it to _numNonfuncSensorsBeforeError. 439 size_t nonfuncSensors = 0; 440 for (const auto& fan : _fans) 441 { 442 for (const auto& s : fan->sensors()) 443 { 444 // Don't count nonfunctional sensors that still have their 445 // error timer running as nonfunctional since they haven't 446 // had event logs created for those errors yet. 447 if (!s->functional() && !s->errorTimerRunning()) 448 { 449 nonfuncSensors++; 450 } 451 } 452 } 453 454 Severity severity = Severity::Error; 455 if (nonfuncSensors < _numNonfuncSensorsBeforeError) 456 { 457 severity = Severity::Informational; 458 } 459 460 auto error = 461 std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault", 462 fanPath, sensor.name(), severity); 463 464 auto sensorData = captureSensorData(); 465 error->commit(sensorData); 466 467 // Save the error so it can be committed again on a power off. 468 _lastError = std::move(error); 469 } 470 471 void System::fanMissingErrorTimerExpired(const Fan& fan) 472 { 473 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 474 475 getLogger().log( 476 fmt::format("Creating event log for missing fan {}", fanPath), 477 Logger::error); 478 479 auto error = std::make_unique<FanError>( 480 "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error); 481 482 auto sensorData = captureSensorData(); 483 error->commit(sensorData); 484 485 // Save the error so it can be committed again on a power off. 486 _lastError = std::move(error); 487 } 488 489 void System::logShutdownError() 490 { 491 if (_lastError) 492 { 493 getLogger().log("Re-committing previous fan error before power off"); 494 495 // Still use the latest sensor data 496 auto sensorData = captureSensorData(); 497 _lastError->commit(sensorData, true); 498 } 499 } 500 501 json System::captureSensorData() 502 { 503 json data; 504 505 for (const auto& fan : _fans) 506 { 507 for (const auto& sensor : fan->sensors()) 508 { 509 json values; 510 values["present"] = fan->present(); 511 values["functional"] = sensor->functional(); 512 values["in_range"] = !fan->outOfRange(*sensor); 513 values["tach"] = sensor->getInput(); 514 515 if (sensor->hasTarget()) 516 { 517 values["target"] = sensor->getTarget(); 518 } 519 520 // convert between string/json to remove newlines 521 values["prev_tachs"] = json(sensor->getPrevTach()).dump(); 522 523 if (sensor->hasTarget()) 524 { 525 values["prev_targets"] = json(sensor->getPrevTarget()).dump(); 526 } 527 528 if (sensor->getMethod() == MethodMode::count) 529 { 530 values["ticks"] = sensor->getCounter(); 531 } 532 data["sensors"][sensor->name()] = values; 533 } 534 } 535 536 return data; 537 } 538 539 void System::handleOfflineFanController() 540 { 541 getLogger().log("The fan controller appears to be offline. Shutting down.", 542 Logger::error); 543 544 auto ffdc = collectHwmonFFDC(); 545 546 FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline", 547 Severity::Critical}; 548 error.commit(ffdc, true); 549 550 PowerInterface::executeHardPowerOff(); 551 552 createBmcDump(); 553 } 554 555 /** 556 * @brief Create a BMC Dump 557 */ 558 void System::createBmcDump() const 559 { 560 try 561 { 562 util::SDBusPlus::callMethod( 563 "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc", 564 "xyz.openbmc_project.Dump.Create", "CreateDump", 565 std::vector< 566 std::pair<std::string, std::variant<std::string, uint64_t>>>()); 567 } 568 catch (const std::exception& e) 569 { 570 getLogger().log( 571 fmt::format("Caught exception while creating BMC dump: {}", 572 e.what()), 573 Logger::error); 574 } 575 } 576 577 void System::dumpDebugData(sdeventplus::source::Signal&, 578 const struct signalfd_siginfo*) 579 { 580 json output; 581 582 if (_loaded) 583 { 584 output["logs"] = getLogger().getLogs(); 585 output["sensors"] = captureSensorData(); 586 } 587 else 588 { 589 output["error"] = "Fan monitor not loaded yet. Try again later."; 590 } 591 592 std::ofstream file{System::dumpFile}; 593 if (!file) 594 { 595 log<level::ERR>("Could not open file for fan monitor dump"); 596 } 597 else 598 { 599 file << std::setw(4) << output; 600 } 601 } 602 603 } // namespace phosphor::fan::monitor 604