1 #pragma once 2 3 #include <algorithm> 4 #include <experimental/filesystem> 5 #include <map> 6 #include <vector> 7 #include "device.hpp" 8 #include "gpio.hpp" 9 #include "pmbus.hpp" 10 #include "types.hpp" 11 12 namespace witherspoon 13 { 14 namespace power 15 { 16 17 //Error type, callout 18 using PartCallout = 19 std::tuple<ucd90160::extraAnalysisType, std::string>; 20 21 /** 22 * @class UCD90160 23 * 24 * This class implements fault analysis for the UCD90160 25 * power sequencer device. 26 * 27 */ 28 class UCD90160 : public Device 29 { 30 public: 31 32 UCD90160() = delete; 33 ~UCD90160() = default; 34 UCD90160(const UCD90160&) = delete; 35 UCD90160& operator=(const UCD90160&) = delete; 36 UCD90160(UCD90160&&) = default; 37 UCD90160& operator=(UCD90160&&) = default; 38 39 /** 40 * Constructor 41 * 42 * @param[in] instance - the device instance number 43 */ 44 UCD90160(size_t instance); 45 46 /** 47 * Analyzes the device for errors when the device is 48 * known to be in an error state. A log will be created. 49 */ 50 void onFailure() override; 51 52 /** 53 * Checks the device for errors and only creates a log 54 * if one is found. 55 */ 56 void analyze() override; 57 58 /** 59 * Clears faults in the device 60 */ 61 void clearFaults() override 62 { 63 } 64 65 private: 66 67 /** 68 * Reports an error for a GPU PGOOD failure 69 * 70 * @param[in] callout - the GPU callout string 71 */ 72 void gpuPGOODError(const std::string& callout); 73 74 /** 75 * Reports an error for a GPU OverTemp failure 76 * 77 * @param[in] callout - the GPU callout string 78 */ 79 void gpuOverTempError(const std::string& callout); 80 81 /** 82 * Given the device path for a chip, find its gpiochip 83 * path 84 * 85 * @param[in] path - device path, like 86 * /sys/devices/.../i2c-11/11-0064 87 * 88 * @return fs::path - The gpiochip path, like 89 * /dev/gpiochip1 90 */ 91 static std::experimental::filesystem::path findGPIODevice( 92 const std::experimental::filesystem::path& path); 93 94 /** 95 * Checks for VOUT faults on the device. 96 * 97 * This device can monitor voltages of its dependent 98 * devices, and VOUT faults are voltage faults 99 * on these devices. 100 * 101 * @return bool - true if an error log was created 102 */ 103 bool checkVOUTFaults(); 104 105 /** 106 * Checks for PGOOD faults on the device. 107 * 108 * This device can monitor the PGOOD signals of its dependent 109 * devices, and this check will look for faults of 110 * those PGOODs. 111 * 112 * @param[in] polling - If this is running while polling for errors, 113 * as opposing to analyzing a fail condition. 114 * 115 * @return bool - true if an error log was created 116 */ 117 bool checkPGOODFaults(bool polling); 118 119 /** 120 * Creates an error log when the device has an error 121 * but it isn't a PGOOD or voltage failure. 122 */ 123 void createPowerFaultLog(); 124 125 /** 126 * Reads the status_word register 127 * 128 * @return uint16_t - the register contents 129 */ 130 uint16_t readStatusWord(); 131 132 /** 133 * Reads the mfr_status register 134 * 135 * @return uint32_t - the register contents 136 */ 137 uint32_t readMFRStatus(); 138 139 /** 140 * Does any additional fault analysis based on the 141 * value of the extraAnalysisType field in the GPIOConfig 142 * entry. 143 * 144 * Used to get better callouts. 145 * 146 * @param[in] config - the GPIOConfig entry to use 147 * 148 * @return bool - true if a HW error was found, false else 149 */ 150 bool doExtraAnalysis(const ucd90160::GPIConfig& config); 151 152 /** 153 * Does additional fault analysis using GPIOs to 154 * specifically identify the failing part. 155 * 156 * Used when there are too many PGOOD inputs for 157 * the UCD90160 to handle, so just a summary bit 158 * is wired into the chip, and then the specific 159 * fault GPIOs are off of a different GPIO device, 160 * like an IO expander. 161 * 162 * @param[in] type - the type of analysis to do 163 * 164 * @return bool - true if a HW error was found, false else 165 */ 166 bool doGPIOAnalysis(ucd90160::extraAnalysisType type); 167 168 /** 169 * Says if we've already logged a Vout fault 170 * 171 * The policy is only 1 of the same error will 172 * be logged for the duration of a class instance. 173 * 174 * @param[in] page - the page to check 175 * 176 * @return bool - if we've already logged a fault against 177 * this page 178 */ 179 inline bool isVoutFaultLogged(uint32_t page) const 180 { 181 return std::find(voutErrors.begin(), 182 voutErrors.end(), 183 page) != voutErrors.end(); 184 } 185 186 /** 187 * Saves that a Vout fault has been logged 188 * 189 * @param[in] page - the page the error was logged against 190 */ 191 inline void setVoutFaultLogged(uint32_t page) 192 { 193 voutErrors.push_back(page); 194 } 195 196 /** 197 * Says if we've already logged a PGOOD fault 198 * 199 * The policy is only 1 of the same errors will 200 * be logged for the duration of a class instance. 201 * 202 * @param[in] input - the input to check 203 * 204 * @return bool - if we've already logged a fault against 205 * this input 206 */ 207 inline bool isPGOODFaultLogged(uint32_t input) const 208 { 209 return std::find(pgoodErrors.begin(), 210 pgoodErrors.end(), 211 input) != pgoodErrors.end(); 212 } 213 214 /** 215 * Says if we've already logged a specific fault 216 * against a specific part 217 * 218 * @param[in] callout - error type and name tuple 219 * 220 * @return bool - if we've already logged this fault 221 * against this part 222 */ 223 inline bool isPartCalledOut(const PartCallout& callout) const 224 { 225 return std::find(callouts.begin(), 226 callouts.end(), 227 callout) != callouts.end(); 228 } 229 230 /** 231 * Saves that a PGOOD fault has been logged 232 * 233 * @param[in] input - the input the error was logged against 234 */ 235 inline void setPGOODFaultLogged(uint32_t input) 236 { 237 pgoodErrors.push_back(input); 238 } 239 240 /** 241 * Saves that a specific fault on a specific part has been done 242 * 243 * @param[in] callout - error type and name tuple 244 */ 245 inline void setPartCallout(const PartCallout& callout) 246 { 247 callouts.push_back(callout); 248 } 249 250 /** 251 * List of pages that Vout errors have 252 * already been logged against 253 */ 254 std::vector<uint32_t> voutErrors; 255 256 /** 257 * List of inputs that PGOOD errors have 258 * already been logged against 259 */ 260 std::vector<uint32_t> pgoodErrors; 261 262 /** 263 * List of callouts that already been done 264 */ 265 std::vector<PartCallout> callouts; 266 267 /** 268 * The read/write interface to this hardware 269 */ 270 pmbus::PMBus interface; 271 272 /** 273 * A map of GPI pin IDs to the GPIO object 274 * used to access them 275 */ 276 std::map<size_t, std::unique_ptr<gpio::GPIO>> gpios; 277 278 /** 279 * Keeps track of device access errors to avoid repeatedly 280 * logging errors for bad hardware 281 */ 282 bool accessError = false; 283 284 /** 285 * Keeps track of GPIO access errors when doing the in depth 286 * PGOOD fault analysis to avoid repeatedly logging errors 287 * for bad hardware 288 */ 289 bool gpioAccessError = false; 290 291 /** 292 * The path to the GPIO device used to read 293 * the GPI (PGOOD) status 294 */ 295 std::experimental::filesystem::path gpioDevice; 296 297 /** 298 * Map of device instance to the instance specific data 299 */ 300 static const ucd90160::DeviceMap deviceMap; 301 }; 302 303 } 304 } 305