1 /** 2 * Copyright © 2021 IBM Corporation 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include "system.hpp" 17 18 #include "fan.hpp" 19 #include "fan_defs.hpp" 20 #include "tach_sensor.hpp" 21 #include "trust_manager.hpp" 22 #include "types.hpp" 23 #include "utility.hpp" 24 #ifdef MONITOR_USE_JSON 25 #include "json_parser.hpp" 26 #endif 27 28 #include "config.h" 29 30 #include "hwmon_ffdc.hpp" 31 32 #include <nlohmann/json.hpp> 33 #include <phosphor-logging/log.hpp> 34 #include <sdbusplus/bus.hpp> 35 #include <sdeventplus/event.hpp> 36 #include <sdeventplus/source/signal.hpp> 37 38 namespace phosphor::fan::monitor 39 { 40 41 using json = nlohmann::json; 42 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level; 43 44 using namespace phosphor::logging; 45 46 System::System(Mode mode, sdbusplus::bus::bus& bus, 47 const sdeventplus::Event& event) : 48 _mode(mode), 49 _bus(bus), _event(event), 50 _powerState(std::make_unique<PGoodState>( 51 bus, std::bind(std::mem_fn(&System::powerStateChanged), this, 52 std::placeholders::_1))), 53 _thermalAlert(bus, THERMAL_ALERT_OBJPATH) 54 {} 55 56 void System::start() 57 { 58 _started = true; 59 json jsonObj = json::object(); 60 #ifdef MONITOR_USE_JSON 61 auto confFile = 62 fan::JsonConfig::getConfFile(_bus, confAppName, confFileName); 63 jsonObj = fan::JsonConfig::load(confFile); 64 #endif 65 // Retrieve and set trust groups within the trust manager 66 setTrustMgr(getTrustGroups(jsonObj)); 67 // Retrieve fan definitions and create fan objects to be monitored 68 setFans(getFanDefinitions(jsonObj)); 69 setFaultConfig(jsonObj); 70 log<level::INFO>("Configuration loaded"); 71 72 if (_powerState->isPowerOn()) 73 { 74 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 75 [this](auto& rule) { 76 rule->check(PowerRuleState::runtime, _fanHealth); 77 }); 78 } 79 80 if (_sensorMatch.empty()) 81 { 82 subscribeSensorsToServices(); 83 } 84 } 85 86 void System::subscribeSensorsToServices() 87 { 88 namespace match = sdbusplus::bus::match; 89 90 SensorMapType sensorMap; 91 92 // build a list of all interfaces, always including the value interface 93 // using set automatically guards against duplicates 94 std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF}; 95 96 for (const auto& fan : _fans) 97 { 98 for (const auto& sensor : fan->sensors()) 99 { 100 unique_interfaces.insert(sensor->getInterface()); 101 } 102 } 103 // convert them to vector to pass into getSubTreeRaw 104 std::vector<std::string> interfaces(unique_interfaces.begin(), 105 unique_interfaces.end()); 106 107 try 108 { 109 // get service information for all service names that are 110 // hosting these interfaces 111 auto serviceObjects = util::SDBusPlus::getSubTreeRaw( 112 _bus, FAN_SENSOR_PATH, interfaces, 0); 113 114 for (const auto& fan : _fans) 115 { 116 // For every sensor in each fan 117 for (const auto& sensor : fan->sensors()) 118 { 119 const auto itServ = serviceObjects.find(sensor->name()); 120 121 if (serviceObjects.end() == itServ || itServ->second.empty()) 122 { 123 getLogger().log( 124 fmt::format("Fan sensor entry {} not found in D-Bus", 125 sensor->name()), 126 Logger::error); 127 continue; 128 } 129 130 for (const auto& [serviceName, unused] : itServ->second) 131 { 132 // associate service name with sensor 133 sensorMap[serviceName].insert(sensor); 134 } 135 } 136 } 137 138 // only create 1 match per service 139 for (const auto& [serviceName, unused] : sensorMap) 140 { 141 // map its service name to the sensor 142 _sensorMatch.emplace_back(std::make_unique<match::match>( 143 _bus, match::rules::nameOwnerChanged(serviceName), 144 std::bind(&System::tachSignalOffline, this, 145 std::placeholders::_1, sensorMap))); 146 } 147 } 148 catch (const util::DBusError&) 149 { 150 // catch exception from getSubTreeRaw() when fan sensor paths don't 151 // exist yet 152 } 153 } 154 155 void System::sighupHandler(sdeventplus::source::Signal&, 156 const struct signalfd_siginfo*) 157 { 158 try 159 { 160 json jsonObj = json::object(); 161 #ifdef MONITOR_USE_JSON 162 jsonObj = getJsonObj(_bus); 163 #endif 164 auto trustGrps = getTrustGroups(jsonObj); 165 auto fanDefs = getFanDefinitions(jsonObj); 166 // Set configured trust groups 167 setTrustMgr(trustGrps); 168 // Clear/set configured fan definitions 169 _fans.clear(); 170 _fanHealth.clear(); 171 setFans(fanDefs); 172 setFaultConfig(jsonObj); 173 log<level::INFO>("Configuration reloaded successfully"); 174 175 if (_powerState->isPowerOn()) 176 { 177 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 178 [this](auto& rule) { 179 rule->check(PowerRuleState::runtime, _fanHealth); 180 }); 181 } 182 183 _sensorMatch.clear(); 184 subscribeSensorsToServices(); 185 } 186 catch (std::runtime_error& re) 187 { 188 log<level::ERR>("Error reloading config, no config changes made", 189 entry("LOAD_ERROR=%s", re.what())); 190 } 191 } 192 193 const std::vector<CreateGroupFunction> 194 System::getTrustGroups(const json& jsonObj) 195 { 196 #ifdef MONITOR_USE_JSON 197 return getTrustGrps(jsonObj); 198 #else 199 return trustGroups; 200 #endif 201 } 202 203 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs) 204 { 205 _trust = std::make_unique<trust::Manager>(groupFuncs); 206 } 207 208 const std::vector<FanDefinition> System::getFanDefinitions(const json& jsonObj) 209 { 210 #ifdef MONITOR_USE_JSON 211 return getFanDefs(jsonObj); 212 #else 213 return fanDefinitions; 214 #endif 215 } 216 217 void System::setFans(const std::vector<FanDefinition>& fanDefs) 218 { 219 for (const auto& fanDef : fanDefs) 220 { 221 // Check if a condition exists on the fan 222 auto condition = std::get<conditionField>(fanDef); 223 if (condition) 224 { 225 // Condition exists, skip adding fan if it fails 226 if (!(*condition)(_bus)) 227 { 228 continue; 229 } 230 } 231 _fans.emplace_back( 232 std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this)); 233 234 updateFanHealth(*(_fans.back())); 235 } 236 } 237 238 // callback indicating a service went [on|off]line. 239 // Determine on/offline status, set all sensors for that service 240 // to new state 241 // 242 void System::tachSignalOffline(sdbusplus::message::message& msg, 243 SensorMapType const& sensorMap) 244 { 245 std::string serviceName, oldOwner, newOwner; 246 247 msg.read(serviceName); 248 msg.read(oldOwner); 249 msg.read(newOwner); 250 251 // true if sensor server came back online, false -> went offline 252 bool hasOwner = !newOwner.empty() && oldOwner.empty(); 253 254 std::string stateStr(hasOwner ? "online" : "offline"); 255 getLogger().log(fmt::format("Changing sensors for service {} to {}", 256 serviceName, stateStr), 257 Logger::info); 258 259 auto sensorItr(sensorMap.find(serviceName)); 260 261 if (sensorItr != sensorMap.end()) 262 { 263 // set all sensors' owner state to not-owned 264 for (auto& sensor : sensorItr->second) 265 { 266 sensor->setOwner(hasOwner); 267 sensor->getFan().process(*sensor); 268 } 269 } 270 } 271 272 void System::updateFanHealth(const Fan& fan) 273 { 274 std::vector<bool> sensorStatus; 275 for (const auto& sensor : fan.sensors()) 276 { 277 sensorStatus.push_back(sensor->functional()); 278 } 279 280 _fanHealth[fan.getName()] = 281 std::make_tuple(fan.present(), std::move(sensorStatus)); 282 } 283 284 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck) 285 { 286 updateFanHealth(fan); 287 288 if (_powerState->isPowerOn() && !skipRulesCheck) 289 { 290 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 291 [this](auto& rule) { 292 rule->check(PowerRuleState::runtime, _fanHealth); 293 }); 294 } 295 } 296 297 void System::setFaultConfig(const json& jsonObj) 298 { 299 #ifdef MONITOR_USE_JSON 300 std::shared_ptr<PowerInterfaceBase> powerInterface = 301 std::make_shared<PowerInterface>(_thermalAlert); 302 303 PowerOffAction::PrePowerOffFunc func = 304 std::bind(std::mem_fn(&System::logShutdownError), this); 305 306 _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func); 307 308 _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj); 309 #endif 310 } 311 312 void System::powerStateChanged(bool powerStateOn) 313 { 314 std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) { 315 fan->powerStateChanged(powerStateOn); 316 }); 317 318 if (powerStateOn) 319 { 320 if (!_started) 321 { 322 log<level::ERR>("No conf file found at power on"); 323 throw std::runtime_error("No conf file found at power on"); 324 } 325 326 // If no fan has its sensors on D-Bus, then there is a problem 327 // with the fan controller. Log an error and shut down. 328 if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) { 329 return fan->numSensorsOnDBusAtPowerOn() == 0; 330 })) 331 { 332 handleOfflineFanController(); 333 return; 334 } 335 336 if (_sensorMatch.empty()) 337 { 338 subscribeSensorsToServices(); 339 } 340 341 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 342 [this](auto& rule) { 343 rule->check(PowerRuleState::atPgood, _fanHealth); 344 }); 345 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 346 [this](auto& rule) { 347 rule->check(PowerRuleState::runtime, _fanHealth); 348 }); 349 } 350 else 351 { 352 _thermalAlert.enabled(false); 353 354 // Cancel any in-progress power off actions 355 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 356 [this](auto& rule) { rule->cancel(); }); 357 } 358 } 359 360 void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor) 361 { 362 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 363 364 getLogger().log( 365 fmt::format("Creating event log for faulted fan {} sensor {}", fanPath, 366 sensor.name()), 367 Logger::error); 368 369 // In order to know if the event log should have a severity of error or 370 // informational, count the number of existing nonfunctional sensors and 371 // compare it to _numNonfuncSensorsBeforeError. 372 size_t nonfuncSensors = 0; 373 for (const auto& fan : _fans) 374 { 375 for (const auto& s : fan->sensors()) 376 { 377 // Don't count nonfunctional sensors that still have their 378 // error timer running as nonfunctional since they haven't 379 // had event logs created for those errors yet. 380 if (!s->functional() && !s->errorTimerRunning()) 381 { 382 nonfuncSensors++; 383 } 384 } 385 } 386 387 Severity severity = Severity::Error; 388 if (nonfuncSensors < _numNonfuncSensorsBeforeError) 389 { 390 severity = Severity::Informational; 391 } 392 393 auto error = 394 std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault", 395 fanPath, sensor.name(), severity); 396 397 auto sensorData = captureSensorData(); 398 error->commit(sensorData); 399 400 // Save the error so it can be committed again on a power off. 401 _lastError = std::move(error); 402 } 403 404 void System::fanMissingErrorTimerExpired(const Fan& fan) 405 { 406 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 407 408 getLogger().log( 409 fmt::format("Creating event log for missing fan {}", fanPath), 410 Logger::error); 411 412 auto error = std::make_unique<FanError>( 413 "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error); 414 415 auto sensorData = captureSensorData(); 416 error->commit(sensorData); 417 418 // Save the error so it can be committed again on a power off. 419 _lastError = std::move(error); 420 } 421 422 void System::logShutdownError() 423 { 424 if (_lastError) 425 { 426 getLogger().log("Re-committing previous fan error before power off"); 427 428 // Still use the latest sensor data 429 auto sensorData = captureSensorData(); 430 _lastError->commit(sensorData, true); 431 } 432 } 433 434 json System::captureSensorData() 435 { 436 json data; 437 438 for (const auto& fan : _fans) 439 { 440 for (const auto& sensor : fan->sensors()) 441 { 442 json values; 443 values["present"] = fan->present(); 444 values["functional"] = sensor->functional(); 445 values["tach"] = sensor->getInput(); 446 if (sensor->hasTarget()) 447 { 448 values["target"] = sensor->getTarget(); 449 } 450 451 data["sensors"][sensor->name()] = values; 452 } 453 } 454 455 return data; 456 } 457 458 void System::handleOfflineFanController() 459 { 460 getLogger().log("The fan controller appears to be offline. Shutting down.", 461 Logger::error); 462 463 auto ffdc = collectHwmonFFDC(); 464 465 FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline", 466 Severity::Critical}; 467 error.commit(ffdc, true); 468 469 PowerInterface::executeHardPowerOff(); 470 } 471 472 } // namespace phosphor::fan::monitor 473