1 /** 2 * Copyright © 2022 IBM Corporation 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include "system.hpp" 17 18 #include "fan.hpp" 19 #include "fan_defs.hpp" 20 #include "tach_sensor.hpp" 21 #include "trust_manager.hpp" 22 #include "types.hpp" 23 #include "utility.hpp" 24 #ifdef MONITOR_USE_JSON 25 #include "json_config.hpp" 26 #include "json_parser.hpp" 27 #endif 28 29 #include "config.h" 30 31 #include "hwmon_ffdc.hpp" 32 33 #include <nlohmann/json.hpp> 34 #include <phosphor-logging/log.hpp> 35 #include <sdbusplus/bus.hpp> 36 #include <sdeventplus/event.hpp> 37 #include <sdeventplus/source/signal.hpp> 38 39 namespace phosphor::fan::monitor 40 { 41 42 using json = nlohmann::json; 43 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level; 44 45 using namespace phosphor::logging; 46 47 System::System(Mode mode, sdbusplus::bus::bus& bus, 48 const sdeventplus::Event& event) : 49 _mode(mode), 50 _bus(bus), _event(event), 51 _powerState(std::make_unique<PGoodState>( 52 bus, std::bind(std::mem_fn(&System::powerStateChanged), this, 53 std::placeholders::_1))), 54 _thermalAlert(bus, THERMAL_ALERT_OBJPATH) 55 {} 56 57 void System::start() 58 { 59 namespace match = sdbusplus::bus::match; 60 61 // must be done before service detection 62 _inventoryMatch = std::make_unique<match::match>( 63 _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC), 64 std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1)); 65 66 bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>( 67 _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus", 68 "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC); 69 70 if (invServiceRunning) 71 { 72 _inventoryMatch.reset(); 73 74 if (!_loaded) 75 { 76 load(); 77 } 78 } 79 } 80 81 void System::load() 82 { 83 json jsonObj = json::object(); 84 #ifdef MONITOR_USE_JSON 85 try 86 { 87 jsonObj = getJsonObj(); 88 #endif 89 auto trustGrps = getTrustGroups(jsonObj); 90 auto fanDefs = getFanDefinitions(jsonObj); 91 // Retrieve and set trust groups within the trust manager 92 setTrustMgr(getTrustGroups(jsonObj)); 93 // Clear/set configured fan definitions 94 _fans.clear(); 95 _fanHealth.clear(); 96 // Retrieve fan definitions and create fan objects to be monitored 97 setFans(fanDefs); 98 setFaultConfig(jsonObj); 99 log<level::INFO>("Configuration loaded"); 100 101 _loaded = true; 102 #ifdef MONITOR_USE_JSON 103 } 104 catch (const phosphor::fan::NoConfigFound&) 105 {} 106 #endif 107 108 if (_powerState->isPowerOn()) 109 { 110 // Fans could be missing on startup, so check the power off rules. 111 // Tach sensors default to functional, so they wouldn't cause a power 112 // off here. 113 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 114 [this](auto& rule) { 115 rule->check(PowerRuleState::runtime, _fanHealth); 116 }); 117 } 118 119 subscribeSensorsToServices(); 120 } 121 122 void System::subscribeSensorsToServices() 123 { 124 namespace match = sdbusplus::bus::match; 125 126 _sensorMatch.clear(); 127 128 SensorMapType sensorMap; 129 130 // build a list of all interfaces, always including the value interface 131 // using set automatically guards against duplicates 132 std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF}; 133 134 for (const auto& fan : _fans) 135 { 136 for (const auto& sensor : fan->sensors()) 137 { 138 unique_interfaces.insert(sensor->getInterface()); 139 } 140 } 141 // convert them to vector to pass into getSubTreeRaw 142 std::vector<std::string> interfaces(unique_interfaces.begin(), 143 unique_interfaces.end()); 144 145 try 146 { 147 // get service information for all service names that are 148 // hosting these interfaces 149 auto serviceObjects = util::SDBusPlus::getSubTreeRaw( 150 _bus, FAN_SENSOR_PATH, interfaces, 0); 151 152 for (const auto& fan : _fans) 153 { 154 // For every sensor in each fan 155 for (const auto& sensor : fan->sensors()) 156 { 157 const auto itServ = serviceObjects.find(sensor->name()); 158 159 if (serviceObjects.end() == itServ || itServ->second.empty()) 160 { 161 getLogger().log( 162 fmt::format("Fan sensor entry {} not found in D-Bus", 163 sensor->name()), 164 Logger::error); 165 continue; 166 } 167 168 for (const auto& [serviceName, unused] : itServ->second) 169 { 170 // associate service name with sensor 171 sensorMap[serviceName].insert(sensor); 172 } 173 } 174 } 175 176 // only create 1 match per service 177 for (const auto& [serviceName, unused] : sensorMap) 178 { 179 // map its service name to the sensor 180 _sensorMatch.emplace_back(std::make_unique<match::match>( 181 _bus, match::rules::nameOwnerChanged(serviceName), 182 std::bind(&System::tachSignalOffline, this, 183 std::placeholders::_1, sensorMap))); 184 } 185 } 186 catch (const util::DBusError&) 187 { 188 // catch exception from getSubTreeRaw() when fan sensor paths don't 189 // exist yet 190 } 191 } 192 193 void System::inventoryOnlineCb(sdbusplus::message::message& msg) 194 { 195 namespace match = sdbusplus::bus::match; 196 197 std::string iface; 198 msg.read(iface); 199 200 if (util::INVENTORY_INTF != iface) 201 { 202 return; 203 } 204 205 std::string oldName; 206 msg.read(oldName); 207 208 std::string newName; 209 msg.read(newName); 210 211 // newName should never be empty since match was reset on the first 212 // nameOwnerChanged signal received from the service. 213 if (!_loaded && !newName.empty()) 214 { 215 load(); 216 } 217 218 // cancel any further notifications about the service state 219 _inventoryMatch.reset(); 220 } 221 222 void System::sighupHandler(sdeventplus::source::Signal&, 223 const struct signalfd_siginfo*) 224 { 225 try 226 { 227 load(); 228 } 229 catch (std::runtime_error& re) 230 { 231 log<level::ERR>("Error reloading config, no config changes made", 232 entry("LOAD_ERROR=%s", re.what())); 233 } 234 } 235 236 const std::vector<CreateGroupFunction> 237 System::getTrustGroups([[maybe_unused]] const json& jsonObj) 238 { 239 #ifdef MONITOR_USE_JSON 240 return getTrustGrps(jsonObj); 241 #else 242 return trustGroups; 243 #endif 244 } 245 246 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs) 247 { 248 _trust = std::make_unique<trust::Manager>(groupFuncs); 249 } 250 251 const std::vector<FanDefinition> 252 System::getFanDefinitions([[maybe_unused]] const json& jsonObj) 253 { 254 #ifdef MONITOR_USE_JSON 255 return getFanDefs(jsonObj); 256 #else 257 return fanDefinitions; 258 #endif 259 } 260 261 void System::setFans(const std::vector<FanDefinition>& fanDefs) 262 { 263 for (const auto& fanDef : fanDefs) 264 { 265 // Check if a condition exists on the fan 266 auto condition = std::get<conditionField>(fanDef); 267 if (condition) 268 { 269 // Condition exists, skip adding fan if it fails 270 if (!(*condition)(_bus)) 271 { 272 continue; 273 } 274 } 275 _fans.emplace_back( 276 std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this)); 277 278 updateFanHealth(*(_fans.back())); 279 } 280 } 281 282 // callback indicating a service went [on|off]line. 283 // Determine on/offline status, set all sensors for that service 284 // to new state 285 // 286 void System::tachSignalOffline(sdbusplus::message::message& msg, 287 SensorMapType const& sensorMap) 288 { 289 std::string serviceName, oldOwner, newOwner; 290 291 msg.read(serviceName); 292 msg.read(oldOwner); 293 msg.read(newOwner); 294 295 // true if sensor server came back online, false -> went offline 296 bool hasOwner = !newOwner.empty() && oldOwner.empty(); 297 298 std::string stateStr(hasOwner ? "online" : "offline"); 299 getLogger().log(fmt::format("Changing sensors for service {} to {}", 300 serviceName, stateStr), 301 Logger::info); 302 303 auto sensorItr(sensorMap.find(serviceName)); 304 305 if (sensorItr != sensorMap.end()) 306 { 307 // set all sensors' owner state to not-owned 308 for (auto& sensor : sensorItr->second) 309 { 310 sensor->setOwner(hasOwner); 311 sensor->getFan().process(*sensor); 312 } 313 } 314 } 315 316 void System::updateFanHealth(const Fan& fan) 317 { 318 std::vector<bool> sensorStatus; 319 for (const auto& sensor : fan.sensors()) 320 { 321 sensorStatus.push_back(sensor->functional()); 322 } 323 324 _fanHealth[fan.getName()] = 325 std::make_tuple(fan.present(), std::move(sensorStatus)); 326 } 327 328 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck) 329 { 330 updateFanHealth(fan); 331 332 if (_powerState->isPowerOn() && !skipRulesCheck) 333 { 334 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 335 [this](auto& rule) { 336 rule->check(PowerRuleState::runtime, _fanHealth); 337 }); 338 } 339 } 340 341 void System::setFaultConfig([[maybe_unused]] const json& jsonObj) 342 { 343 #ifdef MONITOR_USE_JSON 344 std::shared_ptr<PowerInterfaceBase> powerInterface = 345 std::make_shared<PowerInterface>(_thermalAlert); 346 347 PowerOffAction::PrePowerOffFunc func = 348 std::bind(std::mem_fn(&System::logShutdownError), this); 349 350 _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func); 351 352 _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj); 353 #endif 354 } 355 356 void System::powerStateChanged(bool powerStateOn) 357 { 358 std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) { 359 fan->powerStateChanged(powerStateOn); 360 }); 361 362 if (powerStateOn) 363 { 364 if (!_loaded) 365 { 366 log<level::ERR>("No conf file found at power on"); 367 throw std::runtime_error("No conf file found at power on"); 368 } 369 370 // If no fan has its sensors on D-Bus, then there is a problem 371 // with the fan controller. Log an error and shut down. 372 if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) { 373 return fan->numSensorsOnDBusAtPowerOn() == 0; 374 })) 375 { 376 handleOfflineFanController(); 377 return; 378 } 379 380 if (_sensorMatch.empty()) 381 { 382 subscribeSensorsToServices(); 383 } 384 385 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 386 [this](auto& rule) { 387 rule->check(PowerRuleState::atPgood, _fanHealth); 388 }); 389 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 390 [this](auto& rule) { 391 rule->check(PowerRuleState::runtime, _fanHealth); 392 }); 393 } 394 else 395 { 396 _thermalAlert.enabled(false); 397 398 // Cancel any in-progress power off actions 399 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 400 [this](auto& rule) { rule->cancel(); }); 401 } 402 } 403 404 void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor) 405 { 406 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 407 408 getLogger().log( 409 fmt::format("Creating event log for faulted fan {} sensor {}", fanPath, 410 sensor.name()), 411 Logger::error); 412 413 // In order to know if the event log should have a severity of error or 414 // informational, count the number of existing nonfunctional sensors and 415 // compare it to _numNonfuncSensorsBeforeError. 416 size_t nonfuncSensors = 0; 417 for (const auto& fan : _fans) 418 { 419 for (const auto& s : fan->sensors()) 420 { 421 // Don't count nonfunctional sensors that still have their 422 // error timer running as nonfunctional since they haven't 423 // had event logs created for those errors yet. 424 if (!s->functional() && !s->errorTimerRunning()) 425 { 426 nonfuncSensors++; 427 } 428 } 429 } 430 431 Severity severity = Severity::Error; 432 if (nonfuncSensors < _numNonfuncSensorsBeforeError) 433 { 434 severity = Severity::Informational; 435 } 436 437 auto error = 438 std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault", 439 fanPath, sensor.name(), severity); 440 441 auto sensorData = captureSensorData(); 442 error->commit(sensorData); 443 444 // Save the error so it can be committed again on a power off. 445 _lastError = std::move(error); 446 } 447 448 void System::fanMissingErrorTimerExpired(const Fan& fan) 449 { 450 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 451 452 getLogger().log( 453 fmt::format("Creating event log for missing fan {}", fanPath), 454 Logger::error); 455 456 auto error = std::make_unique<FanError>( 457 "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error); 458 459 auto sensorData = captureSensorData(); 460 error->commit(sensorData); 461 462 // Save the error so it can be committed again on a power off. 463 _lastError = std::move(error); 464 } 465 466 void System::logShutdownError() 467 { 468 if (_lastError) 469 { 470 getLogger().log("Re-committing previous fan error before power off"); 471 472 // Still use the latest sensor data 473 auto sensorData = captureSensorData(); 474 _lastError->commit(sensorData, true); 475 } 476 } 477 478 json System::captureSensorData() 479 { 480 json data; 481 482 for (const auto& fan : _fans) 483 { 484 for (const auto& sensor : fan->sensors()) 485 { 486 json values; 487 values["present"] = fan->present(); 488 values["functional"] = sensor->functional(); 489 values["tach"] = sensor->getInput(); 490 491 if (sensor->hasTarget()) 492 { 493 values["target"] = sensor->getTarget(); 494 } 495 496 // convert between string/json to remove newlines 497 values["prev_tachs"] = json(sensor->getPrevTach()).dump(); 498 499 if (sensor->hasTarget()) 500 { 501 values["prev_targets"] = json(sensor->getPrevTarget()).dump(); 502 } 503 504 data["sensors"][sensor->name()] = values; 505 } 506 } 507 508 return data; 509 } 510 511 void System::handleOfflineFanController() 512 { 513 getLogger().log("The fan controller appears to be offline. Shutting down.", 514 Logger::error); 515 516 auto ffdc = collectHwmonFFDC(); 517 518 FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline", 519 Severity::Critical}; 520 error.commit(ffdc, true); 521 522 PowerInterface::executeHardPowerOff(); 523 524 createBmcDump(); 525 } 526 527 /** 528 * @brief Create a BMC Dump 529 */ 530 void System::createBmcDump() const 531 { 532 try 533 { 534 util::SDBusPlus::callMethod( 535 "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc", 536 "xyz.openbmc_project.Dump.Create", "CreateDump", 537 std::vector< 538 std::pair<std::string, std::variant<std::string, uint64_t>>>()); 539 } 540 catch (const std::exception& e) 541 { 542 getLogger().log( 543 fmt::format("Caught exception while creating BMC dump: {}", 544 e.what()), 545 Logger::error); 546 } 547 } 548 549 } // namespace phosphor::fan::monitor 550