1 /** 2 * Copyright © 2020 IBM Corporation 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #pragma once 17 18 #include "fan.hpp" 19 #include "fan_error.hpp" 20 #include "power_off_rule.hpp" 21 #include "power_state.hpp" 22 #include "tach_sensor.hpp" 23 #include "trust_manager.hpp" 24 #include "types.hpp" 25 26 #include <nlohmann/json.hpp> 27 #include <sdbusplus/bus.hpp> 28 #include <sdeventplus/event.hpp> 29 #include <sdeventplus/source/event.hpp> 30 #include <sdeventplus/source/signal.hpp> 31 32 #include <memory> 33 #include <optional> 34 #include <vector> 35 36 namespace phosphor::fan::monitor 37 { 38 39 using json = nlohmann::json; 40 41 // Mapping from service name to sensor 42 using SensorMapType = 43 std::map<std::string, std::set<std::shared_ptr<TachSensor>>>; 44 45 class System 46 { 47 public: 48 System() = delete; 49 ~System() = default; 50 System(const System&) = delete; 51 System(System&&) = delete; 52 System& operator=(const System&) = delete; 53 System& operator=(System&&) = delete; 54 55 /** 56 * Constructor 57 * 58 * @param[in] mode - mode of fan monitor 59 * @param[in] bus - sdbusplus bus object 60 * @param[in] event - event loop reference 61 */ 62 System(Mode mode, sdbusplus::bus_t& bus, const sdeventplus::Event& event); 63 64 /** 65 * @brief Callback function to handle receiving a HUP signal to reload the 66 * JSON configuration. 67 */ 68 void sighupHandler(sdeventplus::source::Signal&, 69 const struct signalfd_siginfo*); 70 71 /** 72 * @brief Called from the fan when it changes either 73 * present or functional status to update the 74 * fan health map. 75 * 76 * @param[in] fan - The fan that changed 77 * @param[in] skipRulesCheck - If the rules checks should be done now. 78 */ 79 void fanStatusChange(const Fan& fan, bool skipRulesCheck = false); 80 81 /** 82 * @brief Called when a fan sensor's error timer expires, which 83 * happens when the sensor has been nonfunctional for a 84 * certain amount of time. An event log will be created. 85 * 86 * @param[in] fan - The parent fan of the sensor 87 * @param[in] sensor - The faulted sensor 88 */ 89 void sensorErrorTimerExpired(const Fan& fan, const TachSensor& sensor); 90 91 /** 92 * @brief Called when the timer that starts when a fan is missing 93 * has expired so an event log needs to be created. 94 * 95 * @param[in] fan - The missing fan. 96 */ 97 void fanMissingErrorTimerExpired(const Fan& fan); 98 99 /** 100 * @brief Called by the power off actions to log an error when there is 101 * a power off due to fan problems. 102 * 103 * The error it logs is just the last fan error that occurred. 104 */ 105 void logShutdownError(); 106 107 /** 108 * @brief Returns true if power is on 109 */ 110 bool isPowerOn() const 111 { 112 return _powerState->isPowerOn(); 113 } 114 115 /** 116 * @brief tests the presence of Inventory and calls load() if present, else 117 * waits for Inventory asynchronously and has a callback to load() when 118 * present 119 */ 120 void start(); 121 122 /** 123 * @brief Parses and populates the fan monitor trust groups and list of fans 124 */ 125 void load(); 126 127 /** 128 * @brief Callback function to handle receiving a USR1 signal to dump 129 * debug data to a file. 130 */ 131 void dumpDebugData(sdeventplus::source::Signal&, 132 const struct signalfd_siginfo*); 133 134 private: 135 /** 136 * @brief Callback from D-Bus when Inventory service comes online 137 * 138 * @param[in] msg - Service details. 139 */ 140 void inventoryOnlineCb(sdbusplus::message_t& msg); 141 142 /** 143 * @brief Create a BMC Dump 144 */ 145 void createBmcDump() const; 146 147 /* The mode of fan monitor */ 148 Mode _mode; 149 150 /* The sdbusplus bus object */ 151 sdbusplus::bus_t& _bus; 152 153 /* The event loop reference */ 154 const sdeventplus::Event& _event; 155 156 /* Trust manager of trust groups */ 157 std::unique_ptr<phosphor::fan::trust::Manager> _trust; 158 159 /* match object to detect Inventory service */ 160 std::unique_ptr<sdbusplus::bus::match_t> _inventoryMatch; 161 162 /* List of fan objects to monitor */ 163 std::vector<std::unique_ptr<Fan>> _fans; 164 165 /** 166 * @brief The latest health of all the fans 167 */ 168 FanHealth _fanHealth; 169 170 /** 171 * @brief The object to watch the power state 172 */ 173 std::unique_ptr<PowerState> _powerState; 174 175 /** 176 * @brief The power off rules, for shutting down the system 177 * due to fan failures. 178 */ 179 std::vector<std::unique_ptr<PowerOffRule>> _powerOffRules; 180 181 /** 182 * @brief The number of concurrently nonfunctional fan sensors 183 * there must be for an event log created due to a 184 * nonfunctional fan sensor to have an Error severity as 185 * opposed to an Informational one. 186 */ 187 std::optional<size_t> _numNonfuncSensorsBeforeError; 188 189 /** 190 * @brief The most recently committed fan error. 191 */ 192 std::unique_ptr<FanError> _lastError; 193 194 /** 195 * @brief The thermal alert D-Bus object 196 */ 197 ThermalAlertObject _thermalAlert; 198 199 /** 200 * @brief The tach sensors D-Bus match objects 201 */ 202 std::vector<std::unique_ptr<sdbusplus::bus::match_t>> _sensorMatch; 203 204 /** 205 * @brief true if config files have been loaded 206 */ 207 bool _loaded = false; 208 209 /** 210 * @brief The name of the dump file 211 */ 212 static const std::string dumpFile; 213 214 /** 215 * @brief Captures tach sensor data as JSON for use in 216 * fan fault and fan missing event logs. 217 * 218 * @return json - The JSON data 219 */ 220 json captureSensorData(); 221 222 /** 223 * @brief creates a subscription (service->sensor) to take sensors 224 * on/offline when D-Bus starts/stops updating values 225 * 226 */ 227 void subscribeSensorsToServices(); 228 229 /** 230 * @brief Retrieve the configured trust groups 231 * 232 * @param[in] jsonObj - JSON object to parse from 233 * 234 * @return List of functions applied on trust groups 235 */ 236 const std::vector<CreateGroupFunction> getTrustGroups(const json& jsonObj); 237 238 /** 239 * @brief Set the trust manager's list of trust group functions 240 * 241 * @param[in] groupFuncs - list of trust group functions 242 */ 243 void setTrustMgr(const std::vector<CreateGroupFunction>& groupFuncs); 244 245 /** 246 * @brief Retrieve the configured fan definitions 247 * 248 * @param[in] jsonObj - JSON object to parse from 249 * 250 * @return List of fan definition data on the fans configured 251 */ 252 const std::vector<FanDefinition> getFanDefinitions(const json& jsonObj); 253 254 /** 255 * @brief Set the list of fans to be monitored 256 * 257 * @param[in] fanDefs - list of fan definitions to create fans monitored 258 */ 259 void setFans(const std::vector<FanDefinition>& fanDefs); 260 261 /** 262 * @brief Updates the fan health map entry for the fan passed in 263 * 264 * @param[in] fan - The fan to update the health map with 265 */ 266 void updateFanHealth(const Fan& fan); 267 268 /** 269 * @brief callback when a tach sensor signal goes offline 270 * 271 * @param[in] msg - D-Bus message containing details (inc. service name) 272 * 273 * @param[in] sensorMap - map providing sensor access for each service 274 */ 275 void tachSignalOffline(sdbusplus::message_t& msg, 276 const SensorMapType& sensorMap); 277 278 /** 279 * @brief The function that runs when the power state changes 280 * 281 * @param[in] powerStateOn - If power is now on or not 282 */ 283 void powerStateChanged(bool powerStateOn); 284 285 /** 286 * @brief Reads the fault configuration from the JSON config 287 * file, such as the power off rule configuration. 288 * 289 * @param[in] jsonObj - JSON object to parse from 290 */ 291 void setFaultConfig(const json& jsonObj); 292 293 /** 294 * @brief Log an error and shut down due to an offline fan controller 295 */ 296 void handleOfflineFanController(); 297 }; 298 299 } // namespace phosphor::fan::monitor 300