1 /** 2 * Copyright © 2020 IBM Corporation 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include "system.hpp" 17 18 #include "fan.hpp" 19 #include "fan_defs.hpp" 20 #include "tach_sensor.hpp" 21 #include "trust_manager.hpp" 22 #include "types.hpp" 23 #ifdef MONITOR_USE_JSON 24 #include "json_parser.hpp" 25 #endif 26 27 #include "config.h" 28 29 #include "hwmon_ffdc.hpp" 30 31 #include <nlohmann/json.hpp> 32 #include <phosphor-logging/log.hpp> 33 #include <sdbusplus/bus.hpp> 34 #include <sdeventplus/event.hpp> 35 #include <sdeventplus/source/signal.hpp> 36 37 namespace phosphor::fan::monitor 38 { 39 40 using json = nlohmann::json; 41 using Severity = sdbusplus::xyz::openbmc_project::Logging::server::Entry::Level; 42 43 using namespace phosphor::logging; 44 45 System::System(Mode mode, sdbusplus::bus::bus& bus, 46 const sdeventplus::Event& event) : 47 _mode(mode), 48 _bus(bus), _event(event), 49 _powerState(std::make_unique<PGoodState>( 50 bus, std::bind(std::mem_fn(&System::powerStateChanged), this, 51 std::placeholders::_1))), 52 _thermalAlert(bus, THERMAL_ALERT_OBJPATH) 53 {} 54 55 void System::start() 56 { 57 _started = true; 58 json jsonObj = json::object(); 59 #ifdef MONITOR_USE_JSON 60 auto confFile = 61 fan::JsonConfig::getConfFile(_bus, confAppName, confFileName); 62 jsonObj = fan::JsonConfig::load(confFile); 63 #endif 64 // Retrieve and set trust groups within the trust manager 65 setTrustMgr(getTrustGroups(jsonObj)); 66 // Retrieve fan definitions and create fan objects to be monitored 67 setFans(getFanDefinitions(jsonObj)); 68 setFaultConfig(jsonObj); 69 log<level::INFO>("Configuration loaded"); 70 71 if (_powerState->isPowerOn()) 72 { 73 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 74 [this](auto& rule) { 75 rule->check(PowerRuleState::runtime, _fanHealth); 76 }); 77 } 78 } 79 80 void System::sighupHandler(sdeventplus::source::Signal&, 81 const struct signalfd_siginfo*) 82 { 83 try 84 { 85 json jsonObj = json::object(); 86 #ifdef MONITOR_USE_JSON 87 jsonObj = getJsonObj(_bus); 88 #endif 89 auto trustGrps = getTrustGroups(jsonObj); 90 auto fanDefs = getFanDefinitions(jsonObj); 91 // Set configured trust groups 92 setTrustMgr(trustGrps); 93 // Clear/set configured fan definitions 94 _fans.clear(); 95 _fanHealth.clear(); 96 setFans(fanDefs); 97 setFaultConfig(jsonObj); 98 log<level::INFO>("Configuration reloaded successfully"); 99 100 if (_powerState->isPowerOn()) 101 { 102 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 103 [this](auto& rule) { 104 rule->check(PowerRuleState::runtime, _fanHealth); 105 }); 106 } 107 } 108 catch (std::runtime_error& re) 109 { 110 log<level::ERR>("Error reloading config, no config changes made", 111 entry("LOAD_ERROR=%s", re.what())); 112 } 113 } 114 115 const std::vector<CreateGroupFunction> 116 System::getTrustGroups(const json& jsonObj) 117 { 118 #ifdef MONITOR_USE_JSON 119 return getTrustGrps(jsonObj); 120 #else 121 return trustGroups; 122 #endif 123 } 124 125 void System::setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs) 126 { 127 _trust = std::make_unique<trust::Manager>(groupFuncs); 128 } 129 130 const std::vector<FanDefinition> System::getFanDefinitions(const json& jsonObj) 131 { 132 #ifdef MONITOR_USE_JSON 133 return getFanDefs(jsonObj); 134 #else 135 return fanDefinitions; 136 #endif 137 } 138 139 void System::setFans(const std::vector<FanDefinition>& fanDefs) 140 { 141 for (const auto& fanDef : fanDefs) 142 { 143 // Check if a condition exists on the fan 144 auto condition = std::get<conditionField>(fanDef); 145 if (condition) 146 { 147 // Condition exists, skip adding fan if it fails 148 if (!(*condition)(_bus)) 149 { 150 continue; 151 } 152 } 153 _fans.emplace_back( 154 std::make_unique<Fan>(_mode, _bus, _event, _trust, fanDef, *this)); 155 156 updateFanHealth(*(_fans.back())); 157 } 158 } 159 160 void System::updateFanHealth(const Fan& fan) 161 { 162 std::vector<bool> sensorStatus; 163 for (const auto& sensor : fan.sensors()) 164 { 165 sensorStatus.push_back(sensor->functional()); 166 } 167 168 _fanHealth[fan.getName()] = 169 std::make_tuple(fan.present(), std::move(sensorStatus)); 170 } 171 172 void System::fanStatusChange(const Fan& fan, bool skipRulesCheck) 173 { 174 updateFanHealth(fan); 175 176 if (_powerState->isPowerOn() && !skipRulesCheck) 177 { 178 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 179 [this](auto& rule) { 180 rule->check(PowerRuleState::runtime, _fanHealth); 181 }); 182 } 183 } 184 185 void System::setFaultConfig(const json& jsonObj) 186 { 187 #ifdef MONITOR_USE_JSON 188 std::shared_ptr<PowerInterfaceBase> powerInterface = 189 std::make_shared<PowerInterface>(_thermalAlert); 190 191 PowerOffAction::PrePowerOffFunc func = 192 std::bind(std::mem_fn(&System::logShutdownError), this); 193 194 _powerOffRules = getPowerOffRules(jsonObj, powerInterface, func); 195 196 _numNonfuncSensorsBeforeError = getNumNonfuncRotorsBeforeError(jsonObj); 197 #endif 198 } 199 200 void System::powerStateChanged(bool powerStateOn) 201 { 202 std::for_each(_fans.begin(), _fans.end(), [powerStateOn](auto& fan) { 203 fan->powerStateChanged(powerStateOn); 204 }); 205 206 if (powerStateOn) 207 { 208 if (!_started) 209 { 210 log<level::ERR>("No conf file found at power on"); 211 throw std::runtime_error("No conf file found at power on"); 212 } 213 214 // If no fan has its sensors on D-Bus, then there is a problem 215 // with the fan controller. Log an error and shut down. 216 if (std::all_of(_fans.begin(), _fans.end(), [](const auto& fan) { 217 return fan->numSensorsOnDBusAtPowerOn() == 0; 218 })) 219 { 220 handleOfflineFanController(); 221 return; 222 } 223 224 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 225 [this](auto& rule) { 226 rule->check(PowerRuleState::atPgood, _fanHealth); 227 }); 228 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 229 [this](auto& rule) { 230 rule->check(PowerRuleState::runtime, _fanHealth); 231 }); 232 } 233 else 234 { 235 _thermalAlert.enabled(false); 236 237 // Cancel any in-progress power off actions 238 std::for_each(_powerOffRules.begin(), _powerOffRules.end(), 239 [this](auto& rule) { rule->cancel(); }); 240 } 241 } 242 243 void System::sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor) 244 { 245 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 246 247 getLogger().log( 248 fmt::format("Creating event log for faulted fan {} sensor {}", fanPath, 249 sensor.name()), 250 Logger::error); 251 252 // In order to know if the event log should have a severity of error or 253 // informational, count the number of existing nonfunctional sensors and 254 // compare it to _numNonfuncSensorsBeforeError. 255 size_t nonfuncSensors = 0; 256 for (const auto& fan : _fans) 257 { 258 for (const auto& s : fan->sensors()) 259 { 260 // Don't count nonfunctional sensors that still have their 261 // error timer running as nonfunctional since they haven't 262 // had event logs created for those errors yet. 263 if (!s->functional() && !s->errorTimerRunning()) 264 { 265 nonfuncSensors++; 266 } 267 } 268 } 269 270 Severity severity = Severity::Error; 271 if (nonfuncSensors < _numNonfuncSensorsBeforeError) 272 { 273 severity = Severity::Informational; 274 } 275 276 auto error = 277 std::make_unique<FanError>("xyz.openbmc_project.Fan.Error.Fault", 278 fanPath, sensor.name(), severity); 279 280 auto sensorData = captureSensorData(); 281 error->commit(sensorData); 282 283 // Save the error so it can be committed again on a power off. 284 _lastError = std::move(error); 285 } 286 287 void System::fanMissingErrorTimerExpired(const Fan& fan) 288 { 289 std::string fanPath{util::INVENTORY_PATH + fan.getName()}; 290 291 getLogger().log( 292 fmt::format("Creating event log for missing fan {}", fanPath), 293 Logger::error); 294 295 auto error = std::make_unique<FanError>( 296 "xyz.openbmc_project.Fan.Error.Missing", fanPath, "", Severity::Error); 297 298 auto sensorData = captureSensorData(); 299 error->commit(sensorData); 300 301 // Save the error so it can be committed again on a power off. 302 _lastError = std::move(error); 303 } 304 305 void System::logShutdownError() 306 { 307 if (_lastError) 308 { 309 getLogger().log("Re-committing previous fan error before power off"); 310 311 // Still use the latest sensor data 312 auto sensorData = captureSensorData(); 313 _lastError->commit(sensorData, true); 314 } 315 } 316 317 json System::captureSensorData() 318 { 319 json data; 320 321 for (const auto& fan : _fans) 322 { 323 for (const auto& sensor : fan->sensors()) 324 { 325 json values; 326 values["present"] = fan->present(); 327 values["functional"] = sensor->functional(); 328 values["tach"] = sensor->getInput(); 329 if (sensor->hasTarget()) 330 { 331 values["target"] = sensor->getTarget(); 332 } 333 334 data["sensors"][sensor->name()] = values; 335 } 336 } 337 338 return data; 339 } 340 341 void System::handleOfflineFanController() 342 { 343 getLogger().log("The fan controller appears to be offline. Shutting down.", 344 Logger::error); 345 346 auto ffdc = collectHwmonFFDC(); 347 348 FanError error{"xyz.openbmc_project.Fan.Error.FanControllerOffline", 349 Severity::Critical}; 350 error.commit(ffdc, true); 351 352 PowerInterface::executeHardPowerOff(); 353 } 354 355 } // namespace phosphor::fan::monitor 356