1 /** 2 * Copyright © 2022 IBM Corporation 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include "system.hpp" 17 18 #include "fan.hpp" 19 #include "fan_defs.hpp" 20 #include "tach_sensor.hpp" 21 #include "trust_manager.hpp" 22 #include "types.hpp" 23 #include "utility.hpp" 24 #ifdef MONITOR_USE_JSON 25 #include "json_config.hpp" 26 #include "json_parser.hpp" 27 #endif 28 29 #include "config.h" 30 31 #include "hwmon_ffdc.hpp" 32 33 #include <nlohmann/json.hpp> 34 #include <phosphor-logging/log.hpp> 35 #include <sdbusplus/bus.hpp> 36 #include <sdbusplus/bus/match.hpp> 37 #include <sdeventplus/event.hpp> 38 #include <sdeventplus/source/signal.hpp> 39 40 namespace phosphor::fan::monitor 41 { 42 43 using json = nlohmann::json; 44 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level; 45 46 using namespace phosphor::logging; 47 48 System::System(Mode mode, sdbusplus::bus_t& bus, 49 const sdeventplus::Event& event) : 50 _mode(mode), 51 _bus(bus), _event(event), 52 _powerState(std::make_unique<PGoodState>( 53 bus, std::bind(std::mem_fn(&System::powerStateChanged), this, 54 std::placeholders::_1))), 55 _thermalAlert(bus, THERMAL_ALERT_OBJPATH) 56 {} 57 58 void System::start() 59 { 60 namespace match = sdbusplus::bus::match; 61 62 // must be done before service detection 63 _inventoryMatch = std::make_unique<sdbusplus::bus::match_t>( 64 _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC), 65 std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1)); 66 67 bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>( 68 _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus", 69 "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC); 70 71 if (invServiceRunning) 72 { 73 _inventoryMatch.reset(); 74 75 if (!_loaded) 76 { 77 load(); 78 } 79 } 80 } 81 82 void System::load() 83 { 84 json jsonObj = json::object(); 85 #ifdef MONITOR_USE_JSON 86 try 87 { 88 jsonObj = getJsonObj(); 89 #endif 90 auto trustGrps = getTrustGroups(jsonObj); 91 auto fanDefs = getFanDefinitions(jsonObj); 92 // Retrieve and set trust groups within the trust manager 93 setTrustMgr(getTrustGroups(jsonObj)); 94 // Clear/set configured fan definitions 95 _fans.clear(); 96 _fanHealth.clear(); 97 // Retrieve fan definitions and create fan objects to be monitored 98 setFans(fanDefs); 99 setFaultConfig(jsonObj); 100 log<level::INFO>("Configuration loaded"); 101 102 _loaded = true; 103 #ifdef MONITOR_USE_JSON 104 } 105 catch (const phosphor::fan::NoConfigFound&) 106 {} 107 #endif 108 109 if (_powerState->isPowerOn()) 110 { 111 // Fans could be missing on startup, so check the power off rules. 112 // Tach sensors default to functional, so they wouldn't cause a power 113 // off here. 114 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 115 [this](auto& rule) { 116 rule->check(PowerRuleState::runtime, _fanHealth); 117 }); 118 } 119 120 subscribeSensorsToServices(); 121 } 122 123 void System::subscribeSensorsToServices() 124 { 125 namespace match = sdbusplus::bus::match; 126 127 _sensorMatch.clear(); 128 129 SensorMapType sensorMap; 130 131 // build a list of all interfaces, always including the value interface 132 // using set automatically guards against duplicates 133 std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF}; 134 135 for (const auto& fan : _fans) 136 { 137 for (const auto& sensor : fan->sensors()) 138 { 139 unique_interfaces.insert(sensor->getInterface()); 140 } 141 } 142 // convert them to vector to pass into getSubTreeRaw 143 std::vector<std::string> interfaces(unique_interfaces.begin(), 144 unique_interfaces.end()); 145 146 try 147 { 148 // get service information for all service names that are 149 // hosting these interfaces 150 auto serviceObjects = util::SDBusPlus::getSubTreeRaw( 151 _bus, FAN_SENSOR_PATH, interfaces, 0); 152 153 for (const auto& fan : _fans) 154 { 155 // For every sensor in each fan 156 for (const auto& sensor : fan->sensors()) 157 { 158 const auto itServ = serviceObjects.find(sensor->name()); 159 160 if (serviceObjects.end() == itServ || itServ->second.empty()) 161 { 162 getLogger().log( 163 fmt::format("Fan sensor entry {} not found in D-Bus", 164 sensor->name()), 165 Logger::error); 166 continue; 167 } 168 169 for (const auto& [serviceName, unused] : itServ->second) 170 { 171 // associate service name with sensor 172 sensorMap[serviceName].insert(sensor); 173 } 174 } 175 } 176 177 // only create 1 match per service 178 for (const auto& [serviceName, unused] : sensorMap) 179 { 180 // map its service name to the sensor 181 _sensorMatch.emplace_back(std::make_unique<sdbusplus::bus::match_t>( 182 _bus, match::rules::nameOwnerChanged(serviceName), 183 std::bind(&System::tachSignalOffline, this, 184 std::placeholders::_1, sensorMap))); 185 } 186 } 187 catch (const util::DBusError&) 188 { 189 // catch exception from getSubTreeRaw() when fan sensor paths don't 190 // exist yet 191 } 192 } 193 194 void System::inventoryOnlineCb(sdbusplus::message_t& msg) 195 { 196 namespace match = sdbusplus::bus::match; 197 198 std::string iface; 199 msg.read(iface); 200 201 if (util::INVENTORY_INTF != iface) 202 { 203 return; 204 } 205 206 std::string oldName; 207 msg.read(oldName); 208 209 std::string newName; 210 msg.read(newName); 211 212 // newName should never be empty since match was reset on the first 213 // nameOwnerChanged signal received from the service. 214 if (!_loaded && !newName.empty()) 215 { 216 load(); 217 } 218 219 // cancel any further notifications about the service state 220 _inventoryMatch.reset(); 221 } 222 223 void System::sighupHandler(sdeventplus::source::Signal&, 224 const struct signalfd_siginfo*) 225 { 226 try 227 { 228 load(); 229 } 230 catch (std::runtime_error& re) 231 { 232 log<level::ERR>("Error reloading config, no config changes made", 233 entry("LOAD_ERROR=%s", re.what())); 234 } 235 } 236 237 const std::vector<CreateGroupFunction> 238 System::getTrustGroups([[maybe_unused]] const json& jsonObj) 239 { 240 #ifdef MONITOR_USE_JSON 241 return getTrustGrps(jsonObj); 242 #else 243 return trustGroups; 244 #endif 245 } 246 247 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs) 248 { 249 _trust = std::make_unique<trust::Manager>(groupFuncs); 250 } 251 252 const std::vector<FanDefinition> 253 System::getFanDefinitions([[maybe_unused]] const json& jsonObj) 254 { 255 #ifdef MONITOR_USE_JSON 256 return getFanDefs(jsonObj); 257 #else 258 return fanDefinitions; 259 #endif 260 } 261 262 void System::setFans(const std::vector<FanDefinition>& fanDefs) 263 { 264 for (const auto& fanDef : fanDefs) 265 { 266 // Check if a condition exists on the fan 267 auto condition = std::get<conditionField>(fanDef); 268 if (condition) 269 { 270 // Condition exists, skip adding fan if it fails 271 if (!(*condition)(_bus)) 272 { 273 continue; 274 } 275 } 276 _fans.emplace_back( 277 std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this)); 278 279 updateFanHealth(*(_fans.back())); 280 } 281 } 282 283 // callback indicating a service went [on|off]line. 284 // Determine on/offline status, set all sensors for that service 285 // to new state 286 // 287 void System::tachSignalOffline(sdbusplus::message_t& msg, 288 SensorMapType const& sensorMap) 289 { 290 std::string serviceName, oldOwner, newOwner; 291 292 msg.read(serviceName); 293 msg.read(oldOwner); 294 msg.read(newOwner); 295 296 // true if sensor server came back online, false -> went offline 297 bool hasOwner = !newOwner.empty() && oldOwner.empty(); 298 299 std::string stateStr(hasOwner ? "online" : "offline"); 300 getLogger().log(fmt::format("Changing sensors for service {} to {}", 301 serviceName, stateStr), 302 Logger::info); 303 304 auto sensorItr(sensorMap.find(serviceName)); 305 306 if (sensorItr != sensorMap.end()) 307 { 308 // set all sensors' owner state to not-owned 309 for (auto& sensor : sensorItr->second) 310 { 311 sensor->setOwner(hasOwner); 312 sensor->getFan().process(*sensor); 313 } 314 } 315 } 316 317 void System::updateFanHealth(const Fan& fan) 318 { 319 std::vector<bool> sensorStatus; 320 for (const auto& sensor : fan.sensors()) 321 { 322 sensorStatus.push_back(sensor->functional()); 323 } 324 325 _fanHealth[fan.getName()] = 326 std::make_tuple(fan.present(), std::move(sensorStatus)); 327 } 328 329 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck) 330 { 331 updateFanHealth(fan); 332 333 if (_powerState->isPowerOn() && !skipRulesCheck) 334 { 335 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 336 [this](auto& rule) { 337 rule->check(PowerRuleState::runtime, _fanHealth); 338 }); 339 } 340 } 341 342 void System::setFaultConfig([[maybe_unused]] const json& jsonObj) 343 { 344 #ifdef MONITOR_USE_JSON 345 std::shared_ptr<PowerInterfaceBase> powerInterface = 346 std::make_shared<PowerInterface>(_thermalAlert); 347 348 PowerOffAction::PrePowerOffFunc func = 349 std::bind(std::mem_fn(&System::logShutdownError), this); 350 351 _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func); 352 353 _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj); 354 #endif 355 } 356 357 void System::powerStateChanged(bool powerStateOn) 358 { 359 std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) { 360 fan->powerStateChanged(powerStateOn); 361 }); 362 363 if (powerStateOn) 364 { 365 if (!_loaded) 366 { 367 log<level::ERR>("No conf file found at power on"); 368 throw std::runtime_error("No conf file found at power on"); 369 } 370 371 // If no fan has its sensors on D-Bus, then there is a problem 372 // with the fan controller. Log an error and shut down. 373 if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) { 374 return fan->numSensorsOnDBusAtPowerOn() == 0; 375 })) 376 { 377 handleOfflineFanController(); 378 return; 379 } 380 381 if (_sensorMatch.empty()) 382 { 383 subscribeSensorsToServices(); 384 } 385 386 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 387 [this](auto& rule) { 388 rule->check(PowerRuleState::atPgood, _fanHealth); 389 }); 390 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 391 [this](auto& rule) { 392 rule->check(PowerRuleState::runtime, _fanHealth); 393 }); 394 } 395 else 396 { 397 _thermalAlert.enabled(false); 398 399 // Cancel any in-progress power off actions 400 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 401 [this](auto& rule) { rule->cancel(); }); 402 } 403 } 404 405 void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor) 406 { 407 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 408 409 getLogger().log( 410 fmt::format("Creating event log for faulted fan {} sensor {}", fanPath, 411 sensor.name()), 412 Logger::error); 413 414 // In order to know if the event log should have a severity of error or 415 // informational, count the number of existing nonfunctional sensors and 416 // compare it to _numNonfuncSensorsBeforeError. 417 size_t nonfuncSensors = 0; 418 for (const auto& fan : _fans) 419 { 420 for (const auto& s : fan->sensors()) 421 { 422 // Don't count nonfunctional sensors that still have their 423 // error timer running as nonfunctional since they haven't 424 // had event logs created for those errors yet. 425 if (!s->functional() && !s->errorTimerRunning()) 426 { 427 nonfuncSensors++; 428 } 429 } 430 } 431 432 Severity severity = Severity::Error; 433 if (nonfuncSensors < _numNonfuncSensorsBeforeError) 434 { 435 severity = Severity::Informational; 436 } 437 438 auto error = 439 std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault", 440 fanPath, sensor.name(), severity); 441 442 auto sensorData = captureSensorData(); 443 error->commit(sensorData); 444 445 // Save the error so it can be committed again on a power off. 446 _lastError = std::move(error); 447 } 448 449 void System::fanMissingErrorTimerExpired(const Fan& fan) 450 { 451 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 452 453 getLogger().log( 454 fmt::format("Creating event log for missing fan {}", fanPath), 455 Logger::error); 456 457 auto error = std::make_unique<FanError>( 458 "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error); 459 460 auto sensorData = captureSensorData(); 461 error->commit(sensorData); 462 463 // Save the error so it can be committed again on a power off. 464 _lastError = std::move(error); 465 } 466 467 void System::logShutdownError() 468 { 469 if (_lastError) 470 { 471 getLogger().log("Re-committing previous fan error before power off"); 472 473 // Still use the latest sensor data 474 auto sensorData = captureSensorData(); 475 _lastError->commit(sensorData, true); 476 } 477 } 478 479 json System::captureSensorData() 480 { 481 json data; 482 483 for (const auto& fan : _fans) 484 { 485 for (const auto& sensor : fan->sensors()) 486 { 487 json values; 488 values["present"] = fan->present(); 489 values["functional"] = sensor->functional(); 490 values["tach"] = sensor->getInput(); 491 492 if (sensor->hasTarget()) 493 { 494 values["target"] = sensor->getTarget(); 495 } 496 497 // convert between string/json to remove newlines 498 values["prev_tachs"] = json(sensor->getPrevTach()).dump(); 499 500 if (sensor->hasTarget()) 501 { 502 values["prev_targets"] = json(sensor->getPrevTarget()).dump(); 503 } 504 505 if (sensor->getMethod() == MethodMode::count) 506 { 507 values["ticks"] = sensor->getCounter(); 508 } 509 data["sensors"][sensor->name()] = values; 510 } 511 } 512 513 return data; 514 } 515 516 void System::handleOfflineFanController() 517 { 518 getLogger().log("The fan controller appears to be offline. Shutting down.", 519 Logger::error); 520 521 auto ffdc = collectHwmonFFDC(); 522 523 FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline", 524 Severity::Critical}; 525 error.commit(ffdc, true); 526 527 PowerInterface::executeHardPowerOff(); 528 529 createBmcDump(); 530 } 531 532 /** 533 * @brief Create a BMC Dump 534 */ 535 void System::createBmcDump() const 536 { 537 try 538 { 539 util::SDBusPlus::callMethod( 540 "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc", 541 "xyz.openbmc_project.Dump.Create", "CreateDump", 542 std::vector< 543 std::pair<std::string, std::variant<std::string, uint64_t>>>()); 544 } 545 catch (const std::exception& e) 546 { 547 getLogger().log( 548 fmt::format("Caught exception while creating BMC dump: {}", 549 e.what()), 550 Logger::error); 551 } 552 } 553 554 } // namespace phosphor::fan::monitor 555