1 /** 2 * Copyright © 2022 IBM Corporation 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include "system.hpp" 17 18 #include "dbus_paths.hpp" 19 #include "fan.hpp" 20 #include "fan_defs.hpp" 21 #include "tach_sensor.hpp" 22 #include "trust_manager.hpp" 23 #include "types.hpp" 24 #include "utility.hpp" 25 #ifdef MONITOR_USE_JSON 26 #include "json_config.hpp" 27 #include "json_parser.hpp" 28 #endif 29 30 #include "config.h" 31 32 #include "hwmon_ffdc.hpp" 33 34 #include <nlohmann/json.hpp> 35 #include <phosphor-logging/log.hpp> 36 #include <sdbusplus/bus.hpp> 37 #include <sdbusplus/bus/match.hpp> 38 #include <sdeventplus/event.hpp> 39 #include <sdeventplus/source/signal.hpp> 40 41 namespace phosphor::fan::monitor 42 { 43 44 using json = nlohmann::json; 45 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level; 46 47 using namespace phosphor::logging; 48 49 const std::string System::dumpFile = "/tmp/fan_monitor_dump.json"; 50 51 System::System(Mode mode, sdbusplus::bus_t& bus, 52 const sdeventplus::Event& event) : 53 _mode(mode), 54 _bus(bus), _event(event), 55 _powerState(std::make_unique<PGoodState>( 56 bus, std::bind(std::mem_fn(&System::powerStateChanged), this, 57 std::placeholders::_1))), 58 _thermalAlert(bus, THERMAL_ALERT_OBJPATH) 59 {} 60 61 void System::start() 62 { 63 namespace match = sdbusplus::bus::match; 64 65 // must be done before service detection 66 _inventoryMatch = std::make_unique<sdbusplus::bus::match_t>( 67 _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC), 68 std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1)); 69 70 bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>( 71 _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus", 72 "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC); 73 74 if (invServiceRunning) 75 { 76 _inventoryMatch.reset(); 77 78 if (!_loaded) 79 { 80 load(); 81 } 82 } 83 } 84 85 void System::load() 86 { 87 json jsonObj = json::object(); 88 #ifdef MONITOR_USE_JSON 89 try 90 { 91 jsonObj = getJsonObj(); 92 #endif 93 auto trustGrps = getTrustGroups(jsonObj); 94 auto fanDefs = getFanDefinitions(jsonObj); 95 // Retrieve and set trust groups within the trust manager 96 setTrustMgr(getTrustGroups(jsonObj)); 97 // Clear/set configured fan definitions 98 _fans.clear(); 99 _fanHealth.clear(); 100 // Retrieve fan definitions and create fan objects to be monitored 101 setFans(fanDefs); 102 setFaultConfig(jsonObj); 103 log<level::INFO>("Configuration loaded"); 104 105 _loaded = true; 106 #ifdef MONITOR_USE_JSON 107 } 108 catch (const phosphor::fan::NoConfigFound&) 109 {} 110 #endif 111 112 if (_powerState->isPowerOn()) 113 { 114 // Fans could be missing on startup, so check the power off rules. 115 // Tach sensors default to functional, so they wouldn't cause a power 116 // off here. 117 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 118 [this](auto& rule) { 119 rule->check(PowerRuleState::runtime, _fanHealth); 120 }); 121 } 122 123 subscribeSensorsToServices(); 124 } 125 126 void System::subscribeSensorsToServices() 127 { 128 namespace match = sdbusplus::bus::match; 129 130 _sensorMatch.clear(); 131 132 SensorMapType sensorMap; 133 134 // build a list of all interfaces, always including the value interface 135 // using set automatically guards against duplicates 136 std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF}; 137 138 for (const auto& fan : _fans) 139 { 140 for (const auto& sensor : fan->sensors()) 141 { 142 unique_interfaces.insert(sensor->getInterface()); 143 } 144 } 145 // convert them to vector to pass into getSubTreeRaw 146 std::vector<std::string> interfaces(unique_interfaces.begin(), 147 unique_interfaces.end()); 148 149 try 150 { 151 // get service information for all service names that are 152 // hosting these interfaces 153 auto serviceObjects = util::SDBusPlus::getSubTreeRaw( 154 _bus, FAN_SENSOR_PATH, interfaces, 0); 155 156 for (const auto& fan : _fans) 157 { 158 // For every sensor in each fan 159 for (const auto& sensor : fan->sensors()) 160 { 161 const auto itServ = serviceObjects.find(sensor->name()); 162 163 if (serviceObjects.end() == itServ || itServ->second.empty()) 164 { 165 getLogger().log( 166 fmt::format("Fan sensor entry {} not found in D-Bus", 167 sensor->name()), 168 Logger::error); 169 continue; 170 } 171 172 for (const auto& [serviceName, unused] : itServ->second) 173 { 174 // associate service name with sensor 175 sensorMap[serviceName].insert(sensor); 176 } 177 } 178 } 179 180 // only create 1 match per service 181 for (const auto& [serviceName, unused] : sensorMap) 182 { 183 // map its service name to the sensor 184 _sensorMatch.emplace_back(std::make_unique<sdbusplus::bus::match_t>( 185 _bus, match::rules::nameOwnerChanged(serviceName), 186 std::bind(&System::tachSignalOffline, this, 187 std::placeholders::_1, sensorMap))); 188 } 189 } 190 catch (const util::DBusError&) 191 { 192 // catch exception from getSubTreeRaw() when fan sensor paths don't 193 // exist yet 194 } 195 } 196 197 void System::inventoryOnlineCb(sdbusplus::message_t& msg) 198 { 199 namespace match = sdbusplus::bus::match; 200 201 std::string iface; 202 msg.read(iface); 203 204 if (util::INVENTORY_INTF != iface) 205 { 206 return; 207 } 208 209 std::string oldName; 210 msg.read(oldName); 211 212 std::string newName; 213 msg.read(newName); 214 215 // newName should never be empty since match was reset on the first 216 // nameOwnerChanged signal received from the service. 217 if (!_loaded && !newName.empty()) 218 { 219 load(); 220 } 221 222 // cancel any further notifications about the service state 223 _inventoryMatch.reset(); 224 } 225 226 void System::sighupHandler(sdeventplus::source::Signal&, 227 const struct signalfd_siginfo*) 228 { 229 try 230 { 231 load(); 232 } 233 catch (std::runtime_error& re) 234 { 235 log<level::ERR>("Error reloading config, no config changes made", 236 entry("LOAD_ERROR=%s", re.what())); 237 } 238 } 239 240 const std::vector<CreateGroupFunction> 241 System::getTrustGroups([[maybe_unused]] const json& jsonObj) 242 { 243 #ifdef MONITOR_USE_JSON 244 return getTrustGrps(jsonObj); 245 #else 246 return trustGroups; 247 #endif 248 } 249 250 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs) 251 { 252 _trust = std::make_unique<trust::Manager>(groupFuncs); 253 } 254 255 const std::vector<FanDefinition> 256 System::getFanDefinitions([[maybe_unused]] const json& jsonObj) 257 { 258 #ifdef MONITOR_USE_JSON 259 return getFanDefs(jsonObj); 260 #else 261 return fanDefinitions; 262 #endif 263 } 264 265 void System::setFans(const std::vector<FanDefinition>& fanDefs) 266 { 267 for (const auto& fanDef : fanDefs) 268 { 269 // Check if a condition exists on the fan 270 auto condition = std::get<conditionField>(fanDef); 271 if (condition) 272 { 273 // Condition exists, skip adding fan if it fails 274 if (!(*condition)(_bus)) 275 { 276 continue; 277 } 278 } 279 _fans.emplace_back( 280 std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this)); 281 282 updateFanHealth(*(_fans.back())); 283 } 284 } 285 286 // callback indicating a service went [on|off]line. 287 // Determine on/offline status, set all sensors for that service 288 // to new state 289 // 290 void System::tachSignalOffline(sdbusplus::message_t& msg, 291 SensorMapType const& sensorMap) 292 { 293 std::string serviceName, oldOwner, newOwner; 294 295 msg.read(serviceName); 296 msg.read(oldOwner); 297 msg.read(newOwner); 298 299 // true if sensor server came back online, false -> went offline 300 bool hasOwner = !newOwner.empty() && oldOwner.empty(); 301 302 std::string stateStr(hasOwner ? "online" : "offline"); 303 getLogger().log(fmt::format("Changing sensors for service {} to {}", 304 serviceName, stateStr), 305 Logger::info); 306 307 auto sensorItr(sensorMap.find(serviceName)); 308 309 if (sensorItr != sensorMap.end()) 310 { 311 // set all sensors' owner state to not-owned 312 for (auto& sensor : sensorItr->second) 313 { 314 sensor->setOwner(hasOwner); 315 sensor->getFan().process(*sensor); 316 } 317 } 318 } 319 320 void System::updateFanHealth(const Fan& fan) 321 { 322 std::vector<bool> sensorStatus; 323 for (const auto& sensor : fan.sensors()) 324 { 325 sensorStatus.push_back(sensor->functional()); 326 } 327 328 _fanHealth[fan.getName()] = 329 std::make_tuple(fan.present(), std::move(sensorStatus)); 330 } 331 332 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck) 333 { 334 updateFanHealth(fan); 335 336 if (_powerState->isPowerOn() && !skipRulesCheck) 337 { 338 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 339 [this](auto& rule) { 340 rule->check(PowerRuleState::runtime, _fanHealth); 341 }); 342 } 343 } 344 345 void System::setFaultConfig([[maybe_unused]] const json& jsonObj) 346 { 347 #ifdef MONITOR_USE_JSON 348 std::shared_ptr<PowerInterfaceBase> powerInterface = 349 std::make_shared<PowerInterface>(_thermalAlert); 350 351 PowerOffAction::PrePowerOffFunc func = 352 std::bind(std::mem_fn(&System::logShutdownError), this); 353 354 _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func); 355 356 _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj); 357 #endif 358 } 359 360 void System::powerStateChanged(bool powerStateOn) 361 { 362 std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) { 363 fan->powerStateChanged(powerStateOn); 364 }); 365 366 if (powerStateOn) 367 { 368 if (!_loaded) 369 { 370 log<level::ERR>("No conf file found at power on"); 371 throw std::runtime_error("No conf file found at power on"); 372 } 373 374 // If no fan has its sensors on D-Bus, then there is a problem 375 // with the fan controller. Log an error and shut down. 376 if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) { 377 return fan->numSensorsOnDBusAtPowerOn() == 0; 378 })) 379 { 380 handleOfflineFanController(); 381 return; 382 } 383 384 if (_sensorMatch.empty()) 385 { 386 subscribeSensorsToServices(); 387 } 388 389 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 390 [this](auto& rule) { 391 rule->check(PowerRuleState::atPgood, _fanHealth); 392 }); 393 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 394 [this](auto& rule) { 395 rule->check(PowerRuleState::runtime, _fanHealth); 396 }); 397 } 398 else 399 { 400 _thermalAlert.enabled(false); 401 402 // Cancel any in-progress power off actions 403 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 404 [this](auto& rule) { rule->cancel(); }); 405 } 406 } 407 408 void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor) 409 { 410 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 411 412 getLogger().log( 413 fmt::format("Creating event log for faulted fan {} sensor {}", fanPath, 414 sensor.name()), 415 Logger::error); 416 417 // In order to know if the event log should have a severity of error or 418 // informational, count the number of existing nonfunctional sensors and 419 // compare it to _numNonfuncSensorsBeforeError. 420 size_t nonfuncSensors = 0; 421 for (const auto& fan : _fans) 422 { 423 for (const auto& s : fan->sensors()) 424 { 425 // Don't count nonfunctional sensors that still have their 426 // error timer running as nonfunctional since they haven't 427 // had event logs created for those errors yet. 428 if (!s->functional() && !s->errorTimerRunning()) 429 { 430 nonfuncSensors++; 431 } 432 } 433 } 434 435 Severity severity = Severity::Error; 436 if (nonfuncSensors < _numNonfuncSensorsBeforeError) 437 { 438 severity = Severity::Informational; 439 } 440 441 auto error = 442 std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault", 443 fanPath, sensor.name(), severity); 444 445 auto sensorData = captureSensorData(); 446 error->commit(sensorData); 447 448 // Save the error so it can be committed again on a power off. 449 _lastError = std::move(error); 450 } 451 452 void System::fanMissingErrorTimerExpired(const Fan& fan) 453 { 454 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 455 456 getLogger().log( 457 fmt::format("Creating event log for missing fan {}", fanPath), 458 Logger::error); 459 460 auto error = std::make_unique<FanError>( 461 "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error); 462 463 auto sensorData = captureSensorData(); 464 error->commit(sensorData); 465 466 // Save the error so it can be committed again on a power off. 467 _lastError = std::move(error); 468 } 469 470 void System::logShutdownError() 471 { 472 if (_lastError) 473 { 474 getLogger().log("Re-committing previous fan error before power off"); 475 476 // Still use the latest sensor data 477 auto sensorData = captureSensorData(); 478 _lastError->commit(sensorData, true); 479 } 480 } 481 482 json System::captureSensorData() 483 { 484 json data; 485 486 for (const auto& fan : _fans) 487 { 488 for (const auto& sensor : fan->sensors()) 489 { 490 json values; 491 values["present"] = fan->present(); 492 values["functional"] = sensor->functional(); 493 values["in_range"] = !fan->outOfRange(*sensor); 494 values["tach"] = sensor->getInput(); 495 496 if (sensor->hasTarget()) 497 { 498 values["target"] = sensor->getTarget(); 499 } 500 501 // convert between string/json to remove newlines 502 values["prev_tachs"] = json(sensor->getPrevTach()).dump(); 503 504 if (sensor->hasTarget()) 505 { 506 values["prev_targets"] = json(sensor->getPrevTarget()).dump(); 507 } 508 509 if (sensor->getMethod() == MethodMode::count) 510 { 511 values["ticks"] = sensor->getCounter(); 512 } 513 data["sensors"][sensor->name()] = values; 514 } 515 } 516 517 return data; 518 } 519 520 void System::handleOfflineFanController() 521 { 522 getLogger().log("The fan controller appears to be offline. Shutting down.", 523 Logger::error); 524 525 auto ffdc = collectHwmonFFDC(); 526 527 FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline", 528 Severity::Critical}; 529 error.commit(ffdc, true); 530 531 PowerInterface::executeHardPowerOff(); 532 533 createBmcDump(); 534 } 535 536 /** 537 * @brief Create a BMC Dump 538 */ 539 void System::createBmcDump() const 540 { 541 try 542 { 543 util::SDBusPlus::callMethod( 544 "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc", 545 "xyz.openbmc_project.Dump.Create", "CreateDump", 546 std::vector< 547 std::pair<std::string, std::variant<std::string, uint64_t>>>()); 548 } 549 catch (const std::exception& e) 550 { 551 getLogger().log( 552 fmt::format("Caught exception while creating BMC dump: {}", 553 e.what()), 554 Logger::error); 555 } 556 } 557 558 void System::dumpDebugData(sdeventplus::source::Signal&, 559 const struct signalfd_siginfo*) 560 { 561 json output; 562 563 if (_loaded) 564 { 565 output["logs"] = getLogger().getLogs(); 566 output["sensors"] = captureSensorData(); 567 } 568 else 569 { 570 output["error"] = "Fan monitor not loaded yet. Try again later."; 571 } 572 573 std::ofstream file{System::dumpFile}; 574 if (!file) 575 { 576 log<level::ERR>("Could not open file for fan monitor dump"); 577 } 578 else 579 { 580 file << std::setw(4) << output; 581 } 582 } 583 584 } // namespace phosphor::fan::monitor 585