1 /** 2 * Copyright © 2022 IBM Corporation 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include "system.hpp" 17 18 #include "fan.hpp" 19 #include "fan_defs.hpp" 20 #include "tach_sensor.hpp" 21 #include "trust_manager.hpp" 22 #include "types.hpp" 23 #include "utility.hpp" 24 #ifdef MONITOR_USE_JSON 25 #include "json_config.hpp" 26 #include "json_parser.hpp" 27 #endif 28 29 #include "config.h" 30 31 #include "hwmon_ffdc.hpp" 32 33 #include <nlohmann/json.hpp> 34 #include <phosphor-logging/log.hpp> 35 #include <sdbusplus/bus.hpp> 36 #include <sdeventplus/event.hpp> 37 #include <sdeventplus/source/signal.hpp> 38 39 namespace phosphor::fan::monitor 40 { 41 42 using json = nlohmann::json; 43 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level; 44 45 using namespace phosphor::logging; 46 47 System::System(Mode mode, sdbusplus::bus::bus& bus, 48 const sdeventplus::Event& event) : 49 _mode(mode), 50 _bus(bus), _event(event), 51 _powerState(std::make_unique<PGoodState>( 52 bus, std::bind(std::mem_fn(&System::powerStateChanged), this, 53 std::placeholders::_1))), 54 _thermalAlert(bus, THERMAL_ALERT_OBJPATH) 55 {} 56 57 void System::start() 58 { 59 namespace match = sdbusplus::bus::match; 60 61 // must be done before service detection 62 _inventoryMatch = std::make_unique<match::match>( 63 _bus, match::rules::nameOwnerChanged(util::INVENTORY_SVC), 64 std::bind(&System::inventoryOnlineCb, this, std::placeholders::_1)); 65 66 bool invServiceRunning = util::SDBusPlus::callMethodAndRead<bool>( 67 _bus, "org.freedesktop.DBus", "/org/freedesktop/DBus", 68 "org.freedesktop.DBus", "NameHasOwner", util::INVENTORY_SVC); 69 70 if (invServiceRunning) 71 { 72 _inventoryMatch.reset(); 73 74 if (!_loaded) 75 { 76 load(); 77 } 78 } 79 } 80 81 void System::load() 82 { 83 json jsonObj = json::object(); 84 #ifdef MONITOR_USE_JSON 85 try 86 { 87 jsonObj = getJsonObj(); 88 #endif 89 auto trustGrps = getTrustGroups(jsonObj); 90 auto fanDefs = getFanDefinitions(jsonObj); 91 // Retrieve and set trust groups within the trust manager 92 setTrustMgr(getTrustGroups(jsonObj)); 93 // Clear/set configured fan definitions 94 _fans.clear(); 95 _fanHealth.clear(); 96 // Retrieve fan definitions and create fan objects to be monitored 97 setFans(fanDefs); 98 setFaultConfig(jsonObj); 99 log<level::INFO>("Configuration loaded"); 100 101 _loaded = true; 102 #ifdef MONITOR_USE_JSON 103 } 104 catch (const phosphor::fan::NoConfigFound&) 105 {} 106 #endif 107 108 if (_powerState->isPowerOn()) 109 { 110 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 111 [this](auto& rule) { 112 rule->check(PowerRuleState::runtime, _fanHealth); 113 }); 114 } 115 116 subscribeSensorsToServices(); 117 } 118 119 void System::subscribeSensorsToServices() 120 { 121 namespace match = sdbusplus::bus::match; 122 123 _sensorMatch.clear(); 124 125 SensorMapType sensorMap; 126 127 // build a list of all interfaces, always including the value interface 128 // using set automatically guards against duplicates 129 std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF}; 130 131 for (const auto& fan : _fans) 132 { 133 for (const auto& sensor : fan->sensors()) 134 { 135 unique_interfaces.insert(sensor->getInterface()); 136 } 137 } 138 // convert them to vector to pass into getSubTreeRaw 139 std::vector<std::string> interfaces(unique_interfaces.begin(), 140 unique_interfaces.end()); 141 142 try 143 { 144 // get service information for all service names that are 145 // hosting these interfaces 146 auto serviceObjects = util::SDBusPlus::getSubTreeRaw( 147 _bus, FAN_SENSOR_PATH, interfaces, 0); 148 149 for (const auto& fan : _fans) 150 { 151 // For every sensor in each fan 152 for (const auto& sensor : fan->sensors()) 153 { 154 const auto itServ = serviceObjects.find(sensor->name()); 155 156 if (serviceObjects.end() == itServ || itServ->second.empty()) 157 { 158 getLogger().log( 159 fmt::format("Fan sensor entry {} not found in D-Bus", 160 sensor->name()), 161 Logger::error); 162 continue; 163 } 164 165 for (const auto& [serviceName, unused] : itServ->second) 166 { 167 // associate service name with sensor 168 sensorMap[serviceName].insert(sensor); 169 } 170 } 171 } 172 173 // only create 1 match per service 174 for (const auto& [serviceName, unused] : sensorMap) 175 { 176 // map its service name to the sensor 177 _sensorMatch.emplace_back(std::make_unique<match::match>( 178 _bus, match::rules::nameOwnerChanged(serviceName), 179 std::bind(&System::tachSignalOffline, this, 180 std::placeholders::_1, sensorMap))); 181 } 182 } 183 catch (const util::DBusError&) 184 { 185 // catch exception from getSubTreeRaw() when fan sensor paths don't 186 // exist yet 187 } 188 } 189 190 void System::inventoryOnlineCb(sdbusplus::message::message& msg) 191 { 192 namespace match = sdbusplus::bus::match; 193 194 std::string iface; 195 msg.read(iface); 196 197 if (util::INVENTORY_INTF != iface) 198 { 199 return; 200 } 201 202 std::string oldName; 203 msg.read(oldName); 204 205 std::string newName; 206 msg.read(newName); 207 208 // newName should never be empty since match was reset on the first 209 // nameOwnerChanged signal received from the service. 210 if (!_loaded && !newName.empty()) 211 { 212 load(); 213 } 214 215 // cancel any further notifications about the service state 216 _inventoryMatch.reset(); 217 } 218 219 void System::sighupHandler(sdeventplus::source::Signal&, 220 const struct signalfd_siginfo*) 221 { 222 try 223 { 224 load(); 225 } 226 catch (std::runtime_error& re) 227 { 228 log<level::ERR>("Error reloading config, no config changes made", 229 entry("LOAD_ERROR=%s", re.what())); 230 } 231 } 232 233 const std::vector<CreateGroupFunction> 234 System::getTrustGroups([[maybe_unused]] const json& jsonObj) 235 { 236 #ifdef MONITOR_USE_JSON 237 return getTrustGrps(jsonObj); 238 #else 239 return trustGroups; 240 #endif 241 } 242 243 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs) 244 { 245 _trust = std::make_unique<trust::Manager>(groupFuncs); 246 } 247 248 const std::vector<FanDefinition> 249 System::getFanDefinitions([[maybe_unused]] const json& jsonObj) 250 { 251 #ifdef MONITOR_USE_JSON 252 return getFanDefs(jsonObj); 253 #else 254 return fanDefinitions; 255 #endif 256 } 257 258 void System::setFans(const std::vector<FanDefinition>& fanDefs) 259 { 260 for (const auto& fanDef : fanDefs) 261 { 262 // Check if a condition exists on the fan 263 auto condition = std::get<conditionField>(fanDef); 264 if (condition) 265 { 266 // Condition exists, skip adding fan if it fails 267 if (!(*condition)(_bus)) 268 { 269 continue; 270 } 271 } 272 _fans.emplace_back( 273 std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this)); 274 275 updateFanHealth(*(_fans.back())); 276 } 277 } 278 279 // callback indicating a service went [on|off]line. 280 // Determine on/offline status, set all sensors for that service 281 // to new state 282 // 283 void System::tachSignalOffline(sdbusplus::message::message& msg, 284 SensorMapType const& sensorMap) 285 { 286 std::string serviceName, oldOwner, newOwner; 287 288 msg.read(serviceName); 289 msg.read(oldOwner); 290 msg.read(newOwner); 291 292 // true if sensor server came back online, false -> went offline 293 bool hasOwner = !newOwner.empty() && oldOwner.empty(); 294 295 std::string stateStr(hasOwner ? "online" : "offline"); 296 getLogger().log(fmt::format("Changing sensors for service {} to {}", 297 serviceName, stateStr), 298 Logger::info); 299 300 auto sensorItr(sensorMap.find(serviceName)); 301 302 if (sensorItr != sensorMap.end()) 303 { 304 // set all sensors' owner state to not-owned 305 for (auto& sensor : sensorItr->second) 306 { 307 sensor->setOwner(hasOwner); 308 sensor->getFan().process(*sensor); 309 } 310 } 311 } 312 313 void System::updateFanHealth(const Fan& fan) 314 { 315 std::vector<bool> sensorStatus; 316 for (const auto& sensor : fan.sensors()) 317 { 318 sensorStatus.push_back(sensor->functional()); 319 } 320 321 _fanHealth[fan.getName()] = 322 std::make_tuple(fan.present(), std::move(sensorStatus)); 323 } 324 325 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck) 326 { 327 updateFanHealth(fan); 328 329 if (_powerState->isPowerOn() && !skipRulesCheck) 330 { 331 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 332 [this](auto& rule) { 333 rule->check(PowerRuleState::runtime, _fanHealth); 334 }); 335 } 336 } 337 338 void System::setFaultConfig([[maybe_unused]] const json& jsonObj) 339 { 340 #ifdef MONITOR_USE_JSON 341 std::shared_ptr<PowerInterfaceBase> powerInterface = 342 std::make_shared<PowerInterface>(_thermalAlert); 343 344 PowerOffAction::PrePowerOffFunc func = 345 std::bind(std::mem_fn(&System::logShutdownError), this); 346 347 _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func); 348 349 _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj); 350 #endif 351 } 352 353 void System::powerStateChanged(bool powerStateOn) 354 { 355 std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) { 356 fan->powerStateChanged(powerStateOn); 357 }); 358 359 if (powerStateOn) 360 { 361 if (!_loaded) 362 { 363 log<level::ERR>("No conf file found at power on"); 364 throw std::runtime_error("No conf file found at power on"); 365 } 366 367 // If no fan has its sensors on D-Bus, then there is a problem 368 // with the fan controller. Log an error and shut down. 369 if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) { 370 return fan->numSensorsOnDBusAtPowerOn() == 0; 371 })) 372 { 373 handleOfflineFanController(); 374 return; 375 } 376 377 if (_sensorMatch.empty()) 378 { 379 subscribeSensorsToServices(); 380 } 381 382 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 383 [this](auto& rule) { 384 rule->check(PowerRuleState::atPgood, _fanHealth); 385 }); 386 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 387 [this](auto& rule) { 388 rule->check(PowerRuleState::runtime, _fanHealth); 389 }); 390 } 391 else 392 { 393 _thermalAlert.enabled(false); 394 395 // Cancel any in-progress power off actions 396 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 397 [this](auto& rule) { rule->cancel(); }); 398 } 399 } 400 401 void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor) 402 { 403 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 404 405 getLogger().log( 406 fmt::format("Creating event log for faulted fan {} sensor {}", fanPath, 407 sensor.name()), 408 Logger::error); 409 410 // In order to know if the event log should have a severity of error or 411 // informational, count the number of existing nonfunctional sensors and 412 // compare it to _numNonfuncSensorsBeforeError. 413 size_t nonfuncSensors = 0; 414 for (const auto& fan : _fans) 415 { 416 for (const auto& s : fan->sensors()) 417 { 418 // Don't count nonfunctional sensors that still have their 419 // error timer running as nonfunctional since they haven't 420 // had event logs created for those errors yet. 421 if (!s->functional() && !s->errorTimerRunning()) 422 { 423 nonfuncSensors++; 424 } 425 } 426 } 427 428 Severity severity = Severity::Error; 429 if (nonfuncSensors < _numNonfuncSensorsBeforeError) 430 { 431 severity = Severity::Informational; 432 } 433 434 auto error = 435 std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault", 436 fanPath, sensor.name(), severity); 437 438 auto sensorData = captureSensorData(); 439 error->commit(sensorData); 440 441 // Save the error so it can be committed again on a power off. 442 _lastError = std::move(error); 443 } 444 445 void System::fanMissingErrorTimerExpired(const Fan& fan) 446 { 447 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 448 449 getLogger().log( 450 fmt::format("Creating event log for missing fan {}", fanPath), 451 Logger::error); 452 453 auto error = std::make_unique<FanError>( 454 "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error); 455 456 auto sensorData = captureSensorData(); 457 error->commit(sensorData); 458 459 // Save the error so it can be committed again on a power off. 460 _lastError = std::move(error); 461 } 462 463 void System::logShutdownError() 464 { 465 if (_lastError) 466 { 467 getLogger().log("Re-committing previous fan error before power off"); 468 469 // Still use the latest sensor data 470 auto sensorData = captureSensorData(); 471 _lastError->commit(sensorData, true); 472 } 473 } 474 475 json System::captureSensorData() 476 { 477 json data; 478 479 for (const auto& fan : _fans) 480 { 481 for (const auto& sensor : fan->sensors()) 482 { 483 json values; 484 values["present"] = fan->present(); 485 values["functional"] = sensor->functional(); 486 values["tach"] = sensor->getInput(); 487 488 if (sensor->hasTarget()) 489 { 490 values["target"] = sensor->getTarget(); 491 } 492 493 // convert between string/json to remove newlines 494 values["prev_tachs"] = json(sensor->getPrevTach()).dump(); 495 496 if (sensor->hasTarget()) 497 { 498 values["prev_targets"] = json(sensor->getPrevTarget()).dump(); 499 } 500 501 data["sensors"][sensor->name()] = values; 502 } 503 } 504 505 return data; 506 } 507 508 void System::handleOfflineFanController() 509 { 510 getLogger().log("The fan controller appears to be offline. Shutting down.", 511 Logger::error); 512 513 auto ffdc = collectHwmonFFDC(); 514 515 FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline", 516 Severity::Critical}; 517 error.commit(ffdc, true); 518 519 PowerInterface::executeHardPowerOff(); 520 521 createBmcDump(); 522 } 523 524 /** 525 * @brief Create a BMC Dump 526 */ 527 void System::createBmcDump() const 528 { 529 try 530 { 531 util::SDBusPlus::callMethod( 532 "xyz.openbmc_project.Dump.Manager", "/xyz/openbmc_project/dump/bmc", 533 "xyz.openbmc_project.Dump.Create", "CreateDump", 534 std::vector< 535 std::pair<std::string, std::variant<std::string, uint64_t>>>()); 536 } 537 catch (const sdbusplus::exception::exception&) 538 {} 539 } 540 541 } // namespace phosphor::fan::monitor 542