1 /** 2 * Copyright © 2021 IBM Corporation 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include "system.hpp" 17 18 #include "fan.hpp" 19 #include "fan_defs.hpp" 20 #include "tach_sensor.hpp" 21 #include "trust_manager.hpp" 22 #include "types.hpp" 23 #include "utility.hpp" 24 #ifdef MONITOR_USE_JSON 25 #include "json_parser.hpp" 26 #endif 27 28 #include "config.h" 29 30 #include "hwmon_ffdc.hpp" 31 32 #include <nlohmann/json.hpp> 33 #include <phosphor-logging/log.hpp> 34 #include <sdbusplus/bus.hpp> 35 #include <sdeventplus/event.hpp> 36 #include <sdeventplus/source/signal.hpp> 37 38 namespace phosphor::fan::monitor 39 { 40 41 using json = nlohmann::json; 42 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level; 43 44 using namespace phosphor::logging; 45 46 System::System(Mode mode, sdbusplus::bus::bus& bus, 47 const sdeventplus::Event& event) : 48 _mode(mode), 49 _bus(bus), _event(event), 50 _powerState(std::make_unique<PGoodState>( 51 bus, std::bind(std::mem_fn(&System::powerStateChanged), this, 52 std::placeholders::_1))), 53 _thermalAlert(bus, THERMAL_ALERT_OBJPATH) 54 {} 55 56 void System::start() 57 { 58 _started = true; 59 json jsonObj = json::object(); 60 #ifdef MONITOR_USE_JSON 61 auto confFile = 62 fan::JsonConfig::getConfFile(_bus, confAppName, confFileName); 63 jsonObj = fan::JsonConfig::load(confFile); 64 #endif 65 // Retrieve and set trust groups within the trust manager 66 setTrustMgr(getTrustGroups(jsonObj)); 67 // Retrieve fan definitions and create fan objects to be monitored 68 setFans(getFanDefinitions(jsonObj)); 69 setFaultConfig(jsonObj); 70 log<level::INFO>("Configuration loaded"); 71 72 if (_powerState->isPowerOn()) 73 { 74 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 75 [this](auto& rule) { 76 rule->check(PowerRuleState::runtime, _fanHealth); 77 }); 78 } 79 80 auto sensorMap = buildNameOwnerChangedMap(); 81 82 namespace match = sdbusplus::bus::match; 83 84 // for each service, register a callback handler for nameOwnerChanged 85 for (const auto& service_itr : sensorMap) 86 { 87 _sensorMatch.push_back(std::make_unique<match::match>( 88 _bus, match::rules::nameOwnerChanged(service_itr.first), 89 std::bind(&System::tachSignalOffline, this, std::placeholders::_1, 90 sensorMap))); 91 } 92 } 93 94 SensorMapType System::buildNameOwnerChangedMap() const 95 { 96 SensorMapType sensorMap; 97 98 // build a list of all interfaces, always including the value interface 99 // using set automatically guards against duplicates 100 std::set<std::string> unique_interfaces{util::FAN_SENSOR_VALUE_INTF}; 101 102 for (const auto& fan : _fans) 103 { 104 for (const auto& sensor : fan->sensors()) 105 { 106 unique_interfaces.insert(sensor->getInterface()); 107 } 108 } 109 // convert them to vector to pass into getSubTreeRaw 110 std::vector<std::string> interfaces(unique_interfaces.begin(), 111 unique_interfaces.end()); 112 113 // get service information for all service names that are 114 // hosting these interfaces 115 auto serviceObjects = 116 util::SDBusPlus::getSubTreeRaw(_bus, FAN_SENSOR_PATH, interfaces, 0); 117 118 for (const auto& fan : _fans) 119 { 120 // For every sensor in each fan 121 for (const auto& sensor : fan->sensors()) 122 { 123 const auto itServ = serviceObjects.find(sensor->name()); 124 125 if (serviceObjects.end() == itServ || itServ->second.empty()) 126 { 127 getLogger().log( 128 fmt::format("Fan sensor entry {} not found in D-Bus", 129 sensor->name()), 130 Logger::error); 131 continue; 132 } 133 134 for (const auto& [serviceName, unused] : itServ->second) 135 { 136 // map its service name to the sensor 137 sensorMap[serviceName].insert(sensor); 138 } 139 } 140 } 141 142 return sensorMap; 143 } 144 145 void System::sighupHandler(sdeventplus::source::Signal&, 146 const struct signalfd_siginfo*) 147 { 148 try 149 { 150 json jsonObj = json::object(); 151 #ifdef MONITOR_USE_JSON 152 jsonObj = getJsonObj(_bus); 153 #endif 154 auto trustGrps = getTrustGroups(jsonObj); 155 auto fanDefs = getFanDefinitions(jsonObj); 156 // Set configured trust groups 157 setTrustMgr(trustGrps); 158 // Clear/set configured fan definitions 159 _fans.clear(); 160 _fanHealth.clear(); 161 setFans(fanDefs); 162 setFaultConfig(jsonObj); 163 log<level::INFO>("Configuration reloaded successfully"); 164 165 if (_powerState->isPowerOn()) 166 { 167 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 168 [this](auto& rule) { 169 rule->check(PowerRuleState::runtime, _fanHealth); 170 }); 171 } 172 } 173 catch (std::runtime_error& re) 174 { 175 log<level::ERR>("Error reloading config, no config changes made", 176 entry("LOAD_ERROR=%s", re.what())); 177 } 178 } 179 180 const std::vector<CreateGroupFunction> 181 System::getTrustGroups(const json& jsonObj) 182 { 183 #ifdef MONITOR_USE_JSON 184 return getTrustGrps(jsonObj); 185 #else 186 return trustGroups; 187 #endif 188 } 189 190 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs) 191 { 192 _trust = std::make_unique<trust::Manager>(groupFuncs); 193 } 194 195 const std::vector<FanDefinition> System::getFanDefinitions(const json& jsonObj) 196 { 197 #ifdef MONITOR_USE_JSON 198 return getFanDefs(jsonObj); 199 #else 200 return fanDefinitions; 201 #endif 202 } 203 204 void System::setFans(const std::vector<FanDefinition>& fanDefs) 205 { 206 for (const auto& fanDef : fanDefs) 207 { 208 // Check if a condition exists on the fan 209 auto condition = std::get<conditionField>(fanDef); 210 if (condition) 211 { 212 // Condition exists, skip adding fan if it fails 213 if (!(*condition)(_bus)) 214 { 215 continue; 216 } 217 } 218 _fans.emplace_back( 219 std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this)); 220 221 updateFanHealth(*(_fans.back())); 222 } 223 } 224 225 // callback indicating a service went [on|off]line. 226 // Determine on/offline status, set all sensors for that service 227 // to new state 228 // 229 void System::tachSignalOffline(sdbusplus::message::message& msg, 230 SensorMapType const& sensorMap) 231 { 232 std::string serviceName, oldOwner, newOwner; 233 234 msg.read(serviceName); 235 msg.read(oldOwner); 236 msg.read(newOwner); 237 238 // true if sensor server came back online, false -> went offline 239 bool hasOwner = !newOwner.empty() && oldOwner.empty(); 240 241 std::string stateStr(hasOwner ? "online" : "offline"); 242 getLogger().log(fmt::format("Changing sensors for service {} to {}", 243 serviceName, stateStr), 244 Logger::info); 245 246 auto sensorItr(sensorMap.find(serviceName)); 247 248 if (sensorItr != sensorMap.end()) 249 { 250 // set all sensors' owner state to not-owned 251 for (auto& sensor : sensorItr->second) 252 { 253 sensor->setOwner(hasOwner); 254 sensor->getFan().process(*sensor); 255 } 256 } 257 } 258 259 void System::updateFanHealth(const Fan& fan) 260 { 261 std::vector<bool> sensorStatus; 262 for (const auto& sensor : fan.sensors()) 263 { 264 sensorStatus.push_back(sensor->functional()); 265 } 266 267 _fanHealth[fan.getName()] = 268 std::make_tuple(fan.present(), std::move(sensorStatus)); 269 } 270 271 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck) 272 { 273 updateFanHealth(fan); 274 275 if (_powerState->isPowerOn() && !skipRulesCheck) 276 { 277 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 278 [this](auto& rule) { 279 rule->check(PowerRuleState::runtime, _fanHealth); 280 }); 281 } 282 } 283 284 void System::setFaultConfig(const json& jsonObj) 285 { 286 #ifdef MONITOR_USE_JSON 287 std::shared_ptr<PowerInterfaceBase> powerInterface = 288 std::make_shared<PowerInterface>(_thermalAlert); 289 290 PowerOffAction::PrePowerOffFunc func = 291 std::bind(std::mem_fn(&System::logShutdownError), this); 292 293 _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func); 294 295 _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj); 296 #endif 297 } 298 299 void System::powerStateChanged(bool powerStateOn) 300 { 301 std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) { 302 fan->powerStateChanged(powerStateOn); 303 }); 304 305 if (powerStateOn) 306 { 307 if (!_started) 308 { 309 log<level::ERR>("No conf file found at power on"); 310 throw std::runtime_error("No conf file found at power on"); 311 } 312 313 // If no fan has its sensors on D-Bus, then there is a problem 314 // with the fan controller. Log an error and shut down. 315 if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) { 316 return fan->numSensorsOnDBusAtPowerOn() == 0; 317 })) 318 { 319 handleOfflineFanController(); 320 return; 321 } 322 323 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 324 [this](auto& rule) { 325 rule->check(PowerRuleState::atPgood, _fanHealth); 326 }); 327 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 328 [this](auto& rule) { 329 rule->check(PowerRuleState::runtime, _fanHealth); 330 }); 331 } 332 else 333 { 334 _thermalAlert.enabled(false); 335 336 // Cancel any in-progress power off actions 337 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 338 [this](auto& rule) { rule->cancel(); }); 339 } 340 } 341 342 void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor) 343 { 344 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 345 346 getLogger().log( 347 fmt::format("Creating event log for faulted fan {} sensor {}", fanPath, 348 sensor.name()), 349 Logger::error); 350 351 // In order to know if the event log should have a severity of error or 352 // informational, count the number of existing nonfunctional sensors and 353 // compare it to _numNonfuncSensorsBeforeError. 354 size_t nonfuncSensors = 0; 355 for (const auto& fan : _fans) 356 { 357 for (const auto& s : fan->sensors()) 358 { 359 // Don't count nonfunctional sensors that still have their 360 // error timer running as nonfunctional since they haven't 361 // had event logs created for those errors yet. 362 if (!s->functional() && !s->errorTimerRunning()) 363 { 364 nonfuncSensors++; 365 } 366 } 367 } 368 369 Severity severity = Severity::Error; 370 if (nonfuncSensors < _numNonfuncSensorsBeforeError) 371 { 372 severity = Severity::Informational; 373 } 374 375 auto error = 376 std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault", 377 fanPath, sensor.name(), severity); 378 379 auto sensorData = captureSensorData(); 380 error->commit(sensorData); 381 382 // Save the error so it can be committed again on a power off. 383 _lastError = std::move(error); 384 } 385 386 void System::fanMissingErrorTimerExpired(const Fan& fan) 387 { 388 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 389 390 getLogger().log( 391 fmt::format("Creating event log for missing fan {}", fanPath), 392 Logger::error); 393 394 auto error = std::make_unique<FanError>( 395 "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error); 396 397 auto sensorData = captureSensorData(); 398 error->commit(sensorData); 399 400 // Save the error so it can be committed again on a power off. 401 _lastError = std::move(error); 402 } 403 404 void System::logShutdownError() 405 { 406 if (_lastError) 407 { 408 getLogger().log("Re-committing previous fan error before power off"); 409 410 // Still use the latest sensor data 411 auto sensorData = captureSensorData(); 412 _lastError->commit(sensorData, true); 413 } 414 } 415 416 json System::captureSensorData() 417 { 418 json data; 419 420 for (const auto& fan : _fans) 421 { 422 for (const auto& sensor : fan->sensors()) 423 { 424 json values; 425 values["present"] = fan->present(); 426 values["functional"] = sensor->functional(); 427 values["tach"] = sensor->getInput(); 428 if (sensor->hasTarget()) 429 { 430 values["target"] = sensor->getTarget(); 431 } 432 433 data["sensors"][sensor->name()] = values; 434 } 435 } 436 437 return data; 438 } 439 440 void System::handleOfflineFanController() 441 { 442 getLogger().log("The fan controller appears to be offline. Shutting down.", 443 Logger::error); 444 445 auto ffdc = collectHwmonFFDC(); 446 447 FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline", 448 Severity::Critical}; 449 error.commit(ffdc, true); 450 451 PowerInterface::executeHardPowerOff(); 452 } 453 454 } // namespace phosphor::fan::monitor 455