1 /** 2 * Copyright © 2022 IBM Corporation 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include "system.hpp" 17 18 #include "dbus_paths.hpp" 19 #include "fan.hpp" 20 #include "fan_defs.hpp" 21 #include "tach_sensor.hpp" 22 #include "trust_manager.hpp" 23 #include "types.hpp" 24 #include "utility.hpp" 25 #ifdef MONITOR_USE_JSON 26 #include "json_config.hpp" 27 #include "json_parser.hpp" 28 #endif 29 30 #include "config.h" 31 32 #include "hwmon_ffdc.hpp" 33 34 #include <nlohmann/json.hpp> 35 #include <phosphor-logging/log.hpp> 36 #include <sdbusplus/bus.hpp> 37 #include <sdbusplus/bus/match.hpp> 38 #include <sdeventplus/event.hpp> 39 #include <sdeventplus/source/signal.hpp> 40 41 namespace phosphor::fan::monitor 42 { 43 44 using json = nlohmann::json; 45 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level; 46 47 using namespace phosphor::logging; 48 49 System::System(Mode mode, sdbusplus::bus_t& bus, 50 const sdeventplus::Event& event) : 51 _mode(mode), 52 _bus(bus), _event(event), 53 _powerState(std::make_unique<PGoodState>( 54 bus, std::bind(std::mem_fn(&System::powerStateChanged), this, 55 std::placeholders::_1))), 56 _thermalAlert(bus, THERMAL_ALERT_OBJPATH) 57 {} 58 59 void System::start() 60 { 61 namespace match = sdbusplus::bus::match; 62 63 // must be done before service detection 64 _inventoryMatch = std::make_unique<sdbusplus::bus::match_t>( 65 _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC), 66 std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1)); 67 68 bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>( 69 _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus", 70 "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC); 71 72 if (invServiceRunning) 73 { 74 _inventoryMatch.reset(); 75 76 if (!_loaded) 77 { 78 load(); 79 } 80 } 81 } 82 83 void System::load() 84 { 85 json jsonObj = json::object(); 86 #ifdef MONITOR_USE_JSON 87 try 88 { 89 jsonObj = getJsonObj(); 90 #endif 91 auto trustGrps = getTrustGroups(jsonObj); 92 auto fanDefs = getFanDefinitions(jsonObj); 93 // Retrieve and set trust groups within the trust manager 94 setTrustMgr(getTrustGroups(jsonObj)); 95 // Clear/set configured fan definitions 96 _fans.clear(); 97 _fanHealth.clear(); 98 // Retrieve fan definitions and create fan objects to be monitored 99 setFans(fanDefs); 100 setFaultConfig(jsonObj); 101 log<level::INFO>("Configuration loaded"); 102 103 _loaded = true; 104 #ifdef MONITOR_USE_JSON 105 } 106 catch (const phosphor::fan::NoConfigFound&) 107 {} 108 #endif 109 110 if (_powerState->isPowerOn()) 111 { 112 // Fans could be missing on startup, so check the power off rules. 113 // Tach sensors default to functional, so they wouldn't cause a power 114 // off here. 115 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 116 [this](auto& rule) { 117 rule->check(PowerRuleState::runtime, _fanHealth); 118 }); 119 } 120 121 subscribeSensorsToServices(); 122 } 123 124 void System::subscribeSensorsToServices() 125 { 126 namespace match = sdbusplus::bus::match; 127 128 _sensorMatch.clear(); 129 130 SensorMapType sensorMap; 131 132 // build a list of all interfaces, always including the value interface 133 // using set automatically guards against duplicates 134 std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF}; 135 136 for (const auto& fan : _fans) 137 { 138 for (const auto& sensor : fan->sensors()) 139 { 140 unique_interfaces.insert(sensor->getInterface()); 141 } 142 } 143 // convert them to vector to pass into getSubTreeRaw 144 std::vector<std::string> interfaces(unique_interfaces.begin(), 145 unique_interfaces.end()); 146 147 try 148 { 149 // get service information for all service names that are 150 // hosting these interfaces 151 auto serviceObjects = util::SDBusPlus::getSubTreeRaw( 152 _bus, FAN_SENSOR_PATH, interfaces, 0); 153 154 for (const auto& fan : _fans) 155 { 156 // For every sensor in each fan 157 for (const auto& sensor : fan->sensors()) 158 { 159 const auto itServ = serviceObjects.find(sensor->name()); 160 161 if (serviceObjects.end() == itServ || itServ->second.empty()) 162 { 163 getLogger().log( 164 fmt::format("Fan sensor entry {} not found in D-Bus", 165 sensor->name()), 166 Logger::error); 167 continue; 168 } 169 170 for (const auto& [serviceName, unused] : itServ->second) 171 { 172 // associate service name with sensor 173 sensorMap[serviceName].insert(sensor); 174 } 175 } 176 } 177 178 // only create 1 match per service 179 for (const auto& [serviceName, unused] : sensorMap) 180 { 181 // map its service name to the sensor 182 _sensorMatch.emplace_back(std::make_unique<sdbusplus::bus::match_t>( 183 _bus, match::rules::nameOwnerChanged(serviceName), 184 std::bind(&System::tachSignalOffline, this, 185 std::placeholders::_1, sensorMap))); 186 } 187 } 188 catch (const util::DBusError&) 189 { 190 // catch exception from getSubTreeRaw() when fan sensor paths don't 191 // exist yet 192 } 193 } 194 195 void System::inventoryOnlineCb(sdbusplus::message_t& msg) 196 { 197 namespace match = sdbusplus::bus::match; 198 199 std::string iface; 200 msg.read(iface); 201 202 if (util::INVENTORY_INTF != iface) 203 { 204 return; 205 } 206 207 std::string oldName; 208 msg.read(oldName); 209 210 std::string newName; 211 msg.read(newName); 212 213 // newName should never be empty since match was reset on the first 214 // nameOwnerChanged signal received from the service. 215 if (!_loaded && !newName.empty()) 216 { 217 load(); 218 } 219 220 // cancel any further notifications about the service state 221 _inventoryMatch.reset(); 222 } 223 224 void System::sighupHandler(sdeventplus::source::Signal&, 225 const struct signalfd_siginfo*) 226 { 227 try 228 { 229 load(); 230 } 231 catch (std::runtime_error& re) 232 { 233 log<level::ERR>("Error reloading config, no config changes made", 234 entry("LOAD_ERROR=%s", re.what())); 235 } 236 } 237 238 const std::vector<CreateGroupFunction> 239 System::getTrustGroups([[maybe_unused]] const json& jsonObj) 240 { 241 #ifdef MONITOR_USE_JSON 242 return getTrustGrps(jsonObj); 243 #else 244 return trustGroups; 245 #endif 246 } 247 248 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs) 249 { 250 _trust = std::make_unique<trust::Manager>(groupFuncs); 251 } 252 253 const std::vector<FanDefinition> 254 System::getFanDefinitions([[maybe_unused]] const json& jsonObj) 255 { 256 #ifdef MONITOR_USE_JSON 257 return getFanDefs(jsonObj); 258 #else 259 return fanDefinitions; 260 #endif 261 } 262 263 void System::setFans(const std::vector<FanDefinition>& fanDefs) 264 { 265 for (const auto& fanDef : fanDefs) 266 { 267 // Check if a condition exists on the fan 268 auto condition = std::get<conditionField>(fanDef); 269 if (condition) 270 { 271 // Condition exists, skip adding fan if it fails 272 if (!(*condition)(_bus)) 273 { 274 continue; 275 } 276 } 277 _fans.emplace_back( 278 std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this)); 279 280 updateFanHealth(*(_fans.back())); 281 } 282 } 283 284 // callback indicating a service went [on|off]line. 285 // Determine on/offline status, set all sensors for that service 286 // to new state 287 // 288 void System::tachSignalOffline(sdbusplus::message_t& msg, 289 SensorMapType const& sensorMap) 290 { 291 std::string serviceName, oldOwner, newOwner; 292 293 msg.read(serviceName); 294 msg.read(oldOwner); 295 msg.read(newOwner); 296 297 // true if sensor server came back online, false -> went offline 298 bool hasOwner = !newOwner.empty() && oldOwner.empty(); 299 300 std::string stateStr(hasOwner ? "online" : "offline"); 301 getLogger().log(fmt::format("Changing sensors for service {} to {}", 302 serviceName, stateStr), 303 Logger::info); 304 305 auto sensorItr(sensorMap.find(serviceName)); 306 307 if (sensorItr != sensorMap.end()) 308 { 309 // set all sensors' owner state to not-owned 310 for (auto& sensor : sensorItr->second) 311 { 312 sensor->setOwner(hasOwner); 313 sensor->getFan().process(*sensor); 314 } 315 } 316 } 317 318 void System::updateFanHealth(const Fan& fan) 319 { 320 std::vector<bool> sensorStatus; 321 for (const auto& sensor : fan.sensors()) 322 { 323 sensorStatus.push_back(sensor->functional()); 324 } 325 326 _fanHealth[fan.getName()] = 327 std::make_tuple(fan.present(), std::move(sensorStatus)); 328 } 329 330 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck) 331 { 332 updateFanHealth(fan); 333 334 if (_powerState->isPowerOn() && !skipRulesCheck) 335 { 336 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 337 [this](auto& rule) { 338 rule->check(PowerRuleState::runtime, _fanHealth); 339 }); 340 } 341 } 342 343 void System::setFaultConfig([[maybe_unused]] const json& jsonObj) 344 { 345 #ifdef MONITOR_USE_JSON 346 std::shared_ptr<PowerInterfaceBase> powerInterface = 347 std::make_shared<PowerInterface>(_thermalAlert); 348 349 PowerOffAction::PrePowerOffFunc func = 350 std::bind(std::mem_fn(&System::logShutdownError), this); 351 352 _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func); 353 354 _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj); 355 #endif 356 } 357 358 void System::powerStateChanged(bool powerStateOn) 359 { 360 std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) { 361 fan->powerStateChanged(powerStateOn); 362 }); 363 364 if (powerStateOn) 365 { 366 if (!_loaded) 367 { 368 log<level::ERR>("No conf file found at power on"); 369 throw std::runtime_error("No conf file found at power on"); 370 } 371 372 // If no fan has its sensors on D-Bus, then there is a problem 373 // with the fan controller. Log an error and shut down. 374 if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) { 375 return fan->numSensorsOnDBusAtPowerOn() == 0; 376 })) 377 { 378 handleOfflineFanController(); 379 return; 380 } 381 382 if (_sensorMatch.empty()) 383 { 384 subscribeSensorsToServices(); 385 } 386 387 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 388 [this](auto& rule) { 389 rule->check(PowerRuleState::atPgood, _fanHealth); 390 }); 391 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 392 [this](auto& rule) { 393 rule->check(PowerRuleState::runtime, _fanHealth); 394 }); 395 } 396 else 397 { 398 _thermalAlert.enabled(false); 399 400 // Cancel any in-progress power off actions 401 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 402 [this](auto& rule) { rule->cancel(); }); 403 } 404 } 405 406 void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor) 407 { 408 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 409 410 getLogger().log( 411 fmt::format("Creating event log for faulted fan {} sensor {}", fanPath, 412 sensor.name()), 413 Logger::error); 414 415 // In order to know if the event log should have a severity of error or 416 // informational, count the number of existing nonfunctional sensors and 417 // compare it to _numNonfuncSensorsBeforeError. 418 size_t nonfuncSensors = 0; 419 for (const auto& fan : _fans) 420 { 421 for (const auto& s : fan->sensors()) 422 { 423 // Don't count nonfunctional sensors that still have their 424 // error timer running as nonfunctional since they haven't 425 // had event logs created for those errors yet. 426 if (!s->functional() && !s->errorTimerRunning()) 427 { 428 nonfuncSensors++; 429 } 430 } 431 } 432 433 Severity severity = Severity::Error; 434 if (nonfuncSensors < _numNonfuncSensorsBeforeError) 435 { 436 severity = Severity::Informational; 437 } 438 439 auto error = 440 std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault", 441 fanPath, sensor.name(), severity); 442 443 auto sensorData = captureSensorData(); 444 error->commit(sensorData); 445 446 // Save the error so it can be committed again on a power off. 447 _lastError = std::move(error); 448 } 449 450 void System::fanMissingErrorTimerExpired(const Fan& fan) 451 { 452 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 453 454 getLogger().log( 455 fmt::format("Creating event log for missing fan {}", fanPath), 456 Logger::error); 457 458 auto error = std::make_unique<FanError>( 459 "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error); 460 461 auto sensorData = captureSensorData(); 462 error->commit(sensorData); 463 464 // Save the error so it can be committed again on a power off. 465 _lastError = std::move(error); 466 } 467 468 void System::logShutdownError() 469 { 470 if (_lastError) 471 { 472 getLogger().log("Re-committing previous fan error before power off"); 473 474 // Still use the latest sensor data 475 auto sensorData = captureSensorData(); 476 _lastError->commit(sensorData, true); 477 } 478 } 479 480 json System::captureSensorData() 481 { 482 json data; 483 484 for (const auto& fan : _fans) 485 { 486 for (const auto& sensor : fan->sensors()) 487 { 488 json values; 489 values["present"] = fan->present(); 490 values["functional"] = sensor->functional(); 491 values["tach"] = sensor->getInput(); 492 493 if (sensor->hasTarget()) 494 { 495 values["target"] = sensor->getTarget(); 496 } 497 498 // convert between string/json to remove newlines 499 values["prev_tachs"] = json(sensor->getPrevTach()).dump(); 500 501 if (sensor->hasTarget()) 502 { 503 values["prev_targets"] = json(sensor->getPrevTarget()).dump(); 504 } 505 506 if (sensor->getMethod() == MethodMode::count) 507 { 508 values["ticks"] = sensor->getCounter(); 509 } 510 data["sensors"][sensor->name()] = values; 511 } 512 } 513 514 return data; 515 } 516 517 void System::handleOfflineFanController() 518 { 519 getLogger().log("The fan controller appears to be offline. Shutting down.", 520 Logger::error); 521 522 auto ffdc = collectHwmonFFDC(); 523 524 FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline", 525 Severity::Critical}; 526 error.commit(ffdc, true); 527 528 PowerInterface::executeHardPowerOff(); 529 530 createBmcDump(); 531 } 532 533 /** 534 * @brief Create a BMC Dump 535 */ 536 void System::createBmcDump() const 537 { 538 try 539 { 540 util::SDBusPlus::callMethod( 541 "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc", 542 "xyz.openbmc_project.Dump.Create", "CreateDump", 543 std::vector< 544 std::pair<std::string, std::variant<std::string, uint64_t>>>()); 545 } 546 catch (const std::exception& e) 547 { 548 getLogger().log( 549 fmt::format("Caught exception while creating BMC dump: {}", 550 e.what()), 551 Logger::error); 552 } 553 } 554 555 } // namespace phosphor::fan::monitor 556